From db749d2134fd2d9cb1eaa1ca5892abcc660916ce Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 25 Apr 2024 08:34:06 -0600
Subject: [PATCH 001/571] fluids: Update PetscCallCeed (from Ratel)

Update from: https://gitlab.com/micromorph/ratel/-/merge_requests/815
---
 examples/fluids/include/ceed-utils.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/fluids/include/ceed-utils.h b/examples/fluids/include/ceed-utils.h
index b7962b0f5c..10d1eeb615 100644
--- a/examples/fluids/include/ceed-utils.h
+++ b/examples/fluids/include/ceed-utils.h
@@ -11,8 +11,10 @@
 
 #define PetscCallCeed(ceed, ...)                                    \
   do {                                                              \
-    int ierr = __VA_ARGS__;                                         \
-    if (ierr != CEED_ERROR_SUCCESS) {                               \
+    int ierr_q_;                                                    \
+    PetscStackUpdateLine;                                           \
+    ierr_q_ = __VA_ARGS__;                                          \
+    if (PetscUnlikely(ierr_q_ != CEED_ERROR_SUCCESS)) {             \
       const char *error_message;                                    \
       CeedGetErrorMessage(ceed, &error_message);                    \
       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \

From 3933d9a06aeb299c1f872998cbdde051f6b53220 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 25 Apr 2024 09:03:59 -0600
Subject: [PATCH 002/571] fluids: Add MatCeedView (from Ratel)

https://gitlab.com/micromorph/ratel/-/merge_requests/817
---
 examples/fluids/include/mat-ceed-impl.h |  1 +
 examples/fluids/src/mat-ceed.c          | 43 +++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h
index f5d5d9ac6a..0bfff6180a 100644
--- a/examples/fluids/include/mat-ceed-impl.h
+++ b/examples/fluids/include/mat-ceed-impl.h
@@ -9,6 +9,7 @@
 #include <ceed.h>
 #include <petscdm.h>
 #include <petscmat.h>
+#include <petsc/private/petscimpl.h>
 
 #if defined(__clang_analyzer__)
 #define MATCEED_EXTERN extern
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index 246df8779c..f5118bccff 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -270,6 +270,47 @@ static PetscErrorCode MatInvertVariableBlockDiagonal_Ceed(Mat mat_ceed, PetscInt
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief View `MATCEED`.
+
+  Collective across MPI processes.
+
+  @param[in]   mat_ceed  `MATCEED` to view
+  @param[in]   viewer    The visualization context
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+static PetscErrorCode MatView_Ceed(Mat mat_ceed, PetscViewer viewer) {
+  PetscBool         is_ascii;
+  PetscViewerFormat format;
+  PetscMPIInt       size;
+  MatCeedContext    ctx;
+
+  PetscFunctionBeginUser;
+  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+  if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)mat_ceed), &viewer));
+
+  PetscCall(PetscViewerGetFormat(viewer, &format));
+  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat_ceed), &size));
+  if (size == 1 && format == PETSC_VIEWER_LOAD_BALANCE) PetscFunctionReturn(PETSC_SUCCESS);
+
+  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &is_ascii));
+  {
+    FILE *file;
+
+    PetscCall(PetscViewerASCIIPrintf(viewer, "MatCEED:\n  Internal MatType:%s\n", ctx->internal_mat_type));
+    PetscCall(PetscViewerASCIIGetPointer(viewer, &file));
+    PetscCall(PetscViewerASCIIPrintf(viewer, " libCEED Operator:\n"));
+    PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult, file));
+    if (ctx->op_mult_transpose) {
+      PetscCall(PetscViewerASCIIPrintf(viewer, "  libCEED Transpose Operator:\n"));
+      PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult_transpose, file));
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 // -----------------------------------------------------------------------------
 // MatCeed
 // -----------------------------------------------------------------------------
@@ -438,6 +479,7 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
   }
   // -- Set mat operations
   PetscCall(MatShellSetContextDestroy(*mat, (PetscErrorCode(*)(void *))MatCeedContextDestroy));
+  PetscCall(MatShellSetOperation(*mat, MATOP_VIEW, (void (*)(void))MatView_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_MULT, (void (*)(void))MatMult_Ceed));
   if (op_mult_transpose) PetscCall(MatShellSetOperation(*mat, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed));
@@ -500,6 +542,7 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
     PetscCall(MatCeedContextReference(ctx));
     PetscCall(MatShellSetContext(mat_other, ctx));
     PetscCall(MatShellSetContextDestroy(mat_other, (PetscErrorCode(*)(void *))MatCeedContextDestroy));
+    PetscCall(MatShellSetOperation(mat_other, MATOP_VIEW, (void (*)(void))MatView_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_MULT, (void (*)(void))MatMult_Ceed));
     if (ctx->op_mult_transpose) PetscCall(MatShellSetOperation(mat_other, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed));

From ff9b3c0e2ebd8a09dedd1c00be3d2c5b29de65cc Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 25 Apr 2024 09:43:51 -0600
Subject: [PATCH 003/571] fluids: Use StatePrimitive in blasius context

---
 examples/fluids/README.md                    | 10 ++--
 examples/fluids/problems/blasius.c           | 43 ++++++++------
 examples/fluids/problems/newtonian.c         |  3 +-
 examples/fluids/qfunctions/blasius.h         | 60 +++++++++++---------
 examples/fluids/qfunctions/newtonian.h       |  2 +-
 examples/fluids/qfunctions/newtonian_types.h |  2 +-
 6 files changed, 66 insertions(+), 54 deletions(-)

diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index 179e3950c8..c3505fe913 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -948,6 +948,11 @@ The Blasius problem has the following command-line options in addition to the Ne
   - `288`
   - `K`
 
+* - `-pressure_infinity`
+  - Atmospheric pressure, also sets IDL reference pressure
+  - `1.01E5`
+  - `Pa`
+
 * - `-temperature_wall`
   - Wall temperature
   - `288`
@@ -958,11 +963,6 @@ The Blasius problem has the following command-line options in addition to the Ne
   - `4.2e-3`
   - `m`
 
-* - `-P0`
-  - Atmospheric pressure
-  - `1.01E5`
-  - `Pa`
-
 * - `-platemesh_modify_mesh`
   - Whether to modify the mesh using the given options below.
   - `false`
diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index 4fbfc977a5..8e662dcfb6 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -21,10 +21,12 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) {
   const BlasiusContext blasius = (BlasiusContext)ctx;
   const PetscScalar   *Tf, *Th;  // Chebyshev coefficients
   PetscScalar         *r, f[4], h[4];
-  PetscInt             N = blasius->n_cheb;
+  PetscInt             N       = blasius->n_cheb;
+  State                S_infty = blasius->S_infty;
+  CeedScalar           U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
 
   PetscFunctionBeginUser;
-  PetscScalar Ma = Mach(&blasius->newtonian_ctx, blasius->T_inf, blasius->U_inf), Pr = Prandtl(&blasius->newtonian_ctx),
+  PetscScalar Ma = Mach(&blasius->newtonian_ctx, S_infty.Y.temperature, U_infty), Pr = Prandtl(&blasius->newtonian_ctx),
               gamma = HeatCapacityRatio(&blasius->newtonian_ctx);
 
   PetscCall(VecGetArrayRead(X, &Tf));
@@ -59,7 +61,7 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) {
 
   // h - left end boundary condition
   ChebyshevEval(N - 1, Th, -1., blasius->eta_max, h);
-  r[N] = h[0] - blasius->T_wall / blasius->T_inf;
+  r[N] = h[0] - blasius->T_wall / S_infty.Y.temperature;
 
   // h - right end boundary condition
   ChebyshevEval(N - 1, Th, 1., blasius->eta_max, h);
@@ -252,7 +254,7 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   CeedScalar T_inf                                = 288.;         // K
   CeedScalar T_wall                               = 288.;         // K
   CeedScalar delta0                               = 4.2e-3;       // m
-  CeedScalar P0                                   = 1.01e5;       // Pa
+  CeedScalar P_inf                                = 1.01e5;       // Pa
   PetscInt   N                                    = 20;           // Number of Chebyshev terms
   PetscBool  weakT                                = PETSC_FALSE;  // weak density or temperature
   PetscReal  mesh_refine_height                   = 5.9e-4;       // m
@@ -260,14 +262,17 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   PetscInt   mesh_Ndelta                          = 45;           // [-]
   PetscReal  mesh_top_angle                       = 5;            // degrees
   char       mesh_ynodes_path[PETSC_MAX_PATH_LEN] = "";
+  PetscBool  flg;
 
   PetscOptionsBegin(comm, NULL, "Options for BLASIUS problem", NULL);
   PetscCall(PetscOptionsBool("-weakT", "Change from rho weak to T weak at inflow", NULL, weakT, &weakT, NULL));
   PetscCall(PetscOptionsScalar("-velocity_infinity", "Velocity at boundary layer edge", NULL, U_inf, &U_inf, NULL));
   PetscCall(PetscOptionsScalar("-temperature_infinity", "Temperature at boundary layer edge", NULL, T_inf, &T_inf, NULL));
+  PetscCall(PetscOptionsScalar("-pressure_infinity", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, &flg));
+  PetscCall(PetscOptionsDeprecated("-P0", "-pressure_infinity", "libCEED 0.12.0", "Use -pressure_infinity to set pressure at boundary layer edge"));
+  if (!flg) PetscCall(PetscOptionsScalar("-P0", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, &flg));
   PetscCall(PetscOptionsScalar("-temperature_wall", "Temperature at wall", NULL, T_wall, &T_wall, NULL));
   PetscCall(PetscOptionsScalar("-delta0", "Boundary layer height at inflow", NULL, delta0, &delta0, NULL));
-  PetscCall(PetscOptionsScalar("-P0", "Pressure at outflow", NULL, P0, &P0, NULL));
   PetscCall(PetscOptionsInt("-n_chebyshev", "Number of Chebyshev terms", NULL, N, &N, NULL));
   PetscCheck(3 <= N && N <= BLASIUS_MAX_N_CHEBYSHEV, comm, PETSC_ERR_ARG_OUTOFRANGE, "-n_chebyshev %" PetscInt_FMT " must be in range [3, %d]", N,
              BLASIUS_MAX_N_CHEBYSHEV);
@@ -293,7 +298,7 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
 
   T_inf *= Kelvin;
   T_wall *= Kelvin;
-  P0 *= Pascal;
+  P_inf *= Pascal;
   U_inf *= meter / second;
   delta0 *= meter;
 
@@ -308,16 +313,20 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   // Some properties depend on parameters from NewtonianIdealGas
   PetscCallCeed(ceed, CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx));
 
-  blasius_ctx->weakT         = weakT;
-  blasius_ctx->U_inf         = U_inf;
-  blasius_ctx->T_inf         = T_inf;
-  blasius_ctx->T_wall        = T_wall;
-  blasius_ctx->delta0        = delta0;
-  blasius_ctx->P0            = P0;
-  blasius_ctx->n_cheb        = N;
-  newtonian_ig_ctx->P0       = P0;
-  blasius_ctx->implicit      = user->phys->implicit;
-  blasius_ctx->newtonian_ctx = *newtonian_ig_ctx;
+  StatePrimitive Y_inf = {
+      .pressure = P_inf, .velocity = {U_inf, 0, 0},
+           .temperature = T_inf
+  };
+  State S_infty = StateFromPrimitive(newtonian_ig_ctx, Y_inf);
+
+  blasius_ctx->weakT             = weakT;
+  blasius_ctx->T_wall            = T_wall;
+  blasius_ctx->delta0            = delta0;
+  blasius_ctx->S_infty           = S_infty;
+  blasius_ctx->n_cheb            = N;
+  newtonian_ig_ctx->idl_pressure = P_inf;
+  blasius_ctx->implicit          = user->phys->implicit;
+  blasius_ctx->newtonian_ctx     = *newtonian_ig_ctx;
 
   {
     PetscReal domain_min[3], domain_max[3];
@@ -338,7 +347,7 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->ics.qfunction_context));
   problem->ics.qfunction_context = blasius_context;
   if (use_stg) {
-    PetscCall(SetupStg(comm, dm, problem, user, weakT, T_inf, P0));
+    PetscCall(SetupStg(comm, dm, problem, user, weakT, S_infty.Y.temperature, S_infty.Y.pressure));
   } else if (diff_filter_mms) {
     PetscCall(DifferentialFilterMmsICSetup(problem));
   } else {
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 61c7ec5a2d..9f39eebaf5 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -322,9 +322,8 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   newtonian_ig_ctx->Ctau_C        = Ctau_C;
   newtonian_ig_ctx->Ctau_M        = Ctau_M;
   newtonian_ig_ctx->Ctau_E        = Ctau_E;
-  newtonian_ig_ctx->P0            = reference.pressure;
+  newtonian_ig_ctx->idl_pressure  = reference.pressure;
   newtonian_ig_ctx->stabilization = stab;
-  newtonian_ig_ctx->P0            = reference.pressure;
   newtonian_ig_ctx->is_implicit   = implicit;
   newtonian_ig_ctx->state_var     = state_var;
   newtonian_ig_ctx->idl_enable    = idl_enable;
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index 52a7ff5614..66ea2d3857 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -17,13 +17,11 @@
 
 typedef struct BlasiusContext_ *BlasiusContext;
 struct BlasiusContext_ {
-  bool                             implicit;                              // !< Using implicit timesteping or not
-  bool                             weakT;                                 // !< flag to set Temperature weakly at inflow
-  CeedScalar                       delta0;                                // !< Boundary layer height at inflow
-  CeedScalar                       U_inf;                                 // !< Velocity at boundary layer edge
-  CeedScalar                       T_inf;                                 // !< Temperature at boundary layer edge
+  bool                             implicit;  // !< Using implicit timesteping or not
+  bool                             weakT;     // !< flag to set Temperature weakly at inflow
+  CeedScalar                       delta0;    // !< Boundary layer height at inflow
+  State                            S_infty;
   CeedScalar                       T_wall;                                // !< Temperature at the wall
-  CeedScalar                       P0;                                    // !< Pressure at outflow
   CeedScalar                       x_inflow;                              // !< Location of inflow in x
   CeedScalar                       n_cheb;                                // !< Number of Chebyshev terms
   CeedScalar                      *X;                                     // !< Chebyshev polynomial coordinate vector (CPU only)
@@ -72,25 +70,26 @@ CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, doub
 // *****************************************************************************
 State CEED_QFUNCTION_HELPER(BlasiusSolution)(const BlasiusContext blasius, const CeedScalar x[3], const CeedScalar x0, const CeedScalar x_inflow,
                                              const CeedScalar rho_infty, CeedScalar *t12) {
-  CeedInt    N     = blasius->n_cheb;
-  CeedScalar mu    = blasius->newtonian_ctx.mu;
-  CeedScalar nu    = mu / rho_infty;
-  CeedScalar eta   = x[1] * sqrt(blasius->U_inf / (nu * (x0 + x[0] - x_inflow)));
-  CeedScalar X     = 2 * (eta / blasius->eta_max) - 1.;
-  CeedScalar U_inf = blasius->U_inf;
-  CeedScalar Rd    = GasConstant(&blasius->newtonian_ctx);
+  CeedInt    N       = blasius->n_cheb;
+  CeedScalar mu      = blasius->newtonian_ctx.mu;
+  State      S_infty = blasius->S_infty;
+  CeedScalar nu      = mu / rho_infty;
+  CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
+  CeedScalar eta     = x[1] * sqrt(U_infty / (nu * (x0 + x[0] - x_inflow)));
+  CeedScalar X       = 2 * (eta / blasius->eta_max) - 1.;
+  CeedScalar Rd      = GasConstant(&blasius->newtonian_ctx);
 
   CeedScalar f[4], h[4];
   ChebyshevEval(N, blasius->Tf_cheb, X, blasius->eta_max, f);
   ChebyshevEval(N - 1, blasius->Th_cheb, X, blasius->eta_max, h);
 
-  *t12 = mu * U_inf * f[2] * sqrt(U_inf / (nu * (x0 + x[0] - x_inflow)));
+  *t12 = mu * U_infty * f[2] * sqrt(U_infty / (nu * (x0 + x[0] - x_inflow)));
 
   CeedScalar Y[5];
-  Y[1] = U_inf * f[1];
-  Y[2] = 0.5 * sqrt(nu * U_inf / (x0 + x[0] - x_inflow)) * (eta * f[1] - f[0]);
+  Y[1] = U_infty * f[1];
+  Y[2] = 0.5 * sqrt(nu * U_infty / (x0 + x[0] - x_inflow)) * (eta * f[1] - f[0]);
   Y[3] = 0.;
-  Y[4] = blasius->T_inf * h[0];
+  Y[4] = S_infty.Y.temperature * h[0];
   Y[0] = rho_infty / h[0] * Rd * Y[4];
   return StateFromY(&blasius->newtonian_ctx, Y);
 }
@@ -109,14 +108,14 @@ CEED_QFUNCTION(ICsBlasius)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
   const CeedScalar               x_inflow = context->x_inflow;
   CeedScalar                     t12;
 
-  const CeedScalar Y_inf[5] = {context->P0, context->U_inf, 0, 0, context->T_inf};
-  const State      s_inf    = StateFromY(gas, Y_inf);
+  const State      S_infty = context->S_infty;
+  const CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
 
-  const CeedScalar x0 = context->U_inf * s_inf.U.density / (mu * 25 / Square(delta0));
+  const CeedScalar x0 = U_infty * S_infty.U.density / (mu * 25 / Square(delta0));
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[3] = {X[0][i], X[1][i], X[2][i]};
-    State            s    = BlasiusSolution(context, x, x0, x_inflow, s_inf.U.density, &t12);
+    State            s    = BlasiusSolution(context, x, x0, x_inflow, S_infty.U.density, &t12);
     CeedScalar       q[5] = {0};
 
     switch (gas->state_var) {
@@ -143,8 +142,10 @@ CEED_QFUNCTION(Blasius_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in
 
   const bool                     is_implicit = context->implicit;
   const NewtonianIdealGasContext gas         = &context->newtonian_ctx;
-  const CeedScalar               rho_0       = context->P0 / (GasConstant(gas) * context->T_inf);
-  const CeedScalar               x0          = context->U_inf * rho_0 / (gas->mu * 25 / Square(context->delta0));
+  State                          S_infty     = context->S_infty;
+  const CeedScalar               rho_0       = S_infty.U.density;
+  const CeedScalar               U_infty     = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
+  const CeedScalar               x0          = U_infty * rho_0 / (gas->mu * 25 / Square(context->delta0));
   const CeedScalar               zeros[11]   = {0.};
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
@@ -198,8 +199,10 @@ CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar *
   const bool                     is_implicit = context->implicit;
   const CeedScalar               Rd          = GasConstant(gas);
   const CeedScalar               gamma       = HeatCapacityRatio(gas);
-  const CeedScalar               rho_0       = context->P0 / (Rd * context->T_inf);
-  const CeedScalar               x0          = context->U_inf * rho_0 / (gas->mu * 25 / (Square(context->delta0)));
+  const State                    S_infty     = context->S_infty;
+  const CeedScalar               rho_0       = S_infty.U.density;
+  const CeedScalar               U_infty     = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
+  const CeedScalar               x0          = U_infty * rho_0 / (gas->mu * 25 / Square(context->delta0));
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     CeedScalar wdetJb, norm[3];
@@ -216,11 +219,12 @@ CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar *
     if (context->weakT) {
       // rho should be from the current solution
       drho                   = dq[0][i];
-      CeedScalar dE_internal = drho * gas->cv * context->T_inf;
+      CeedScalar dE_internal = drho * gas->cv * S_infty.Y.temperature;
       CeedScalar dE_kinetic  = .5 * drho * Dot3(s.Y.velocity, s.Y.velocity);
       dE                     = dE_internal + dE_kinetic;
-      dP                     = drho * Rd * context->T_inf;  // interior rho with exterior T
-    } else {                                                // rho specified, E_internal from solution
+      dP                     = drho * Rd * S_infty.Y.temperature;  // interior rho with exterior T
+    } else {
+      // rho specified, E_internal from solution
       drho = 0;
       dE   = dq[4][i];
       dP   = dE * (gamma - 1.);
diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h
index 1068519217..4804965e93 100644
--- a/examples/fluids/qfunctions/newtonian.h
+++ b/examples/fluids/qfunctions/newtonian.h
@@ -211,7 +211,7 @@ CEED_QFUNCTION_HELPER int IFunction_Newtonian(void *ctx, CeedInt Q, const CeedSc
   NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx;
   const CeedScalar        *g       = context->g;
   const CeedScalar         dt      = context->dt;
-  const CeedScalar         P0      = context->P0;
+  const CeedScalar         P0      = context->idl_pressure;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar qi[5]  = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
diff --git a/examples/fluids/qfunctions/newtonian_types.h b/examples/fluids/qfunctions/newtonian_types.h
index 3a5402c36d..0f238298b7 100644
--- a/examples/fluids/qfunctions/newtonian_types.h
+++ b/examples/fluids/qfunctions/newtonian_types.h
@@ -32,11 +32,11 @@ struct NewtonianIdealGasContext_ {
   CeedScalar        dt;
   CeedScalar        time;
   CeedScalar        ijacobian_time_shift;
-  CeedScalar        P0;
   bool              is_implicit;
   StateVariable     state_var;
   StabilizationType stabilization;
   bool              idl_enable;
+  CeedScalar        idl_pressure;
   CeedScalar        idl_amplitude;
   CeedScalar        idl_start;
   CeedScalar        idl_length;

From 64f98e986d64302820caec520c05deb68bf27a53 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 26 Apr 2024 08:49:34 -0600
Subject: [PATCH 004/571] fluids: Consolidate QF and Op creation blocks

---
 examples/fluids/navierstokes.c     |   5 -
 examples/fluids/navierstokes.h     |   5 +-
 examples/fluids/src/setuplibceed.c | 244 ++++++++++++++---------------
 3 files changed, 116 insertions(+), 138 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 5741119dde..3fe6aa3fa0 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -297,10 +297,6 @@ int main(int argc, char **argv) {
   }
 
   // -- QFunctions
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_vol));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_ics));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_rhs_vol));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_ifunction_vol));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_sur));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow_jacobian));
@@ -308,7 +304,6 @@ int main(int argc, char **argv) {
   PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_freestream_jacobian));
 
   // -- Operators
-  PetscCallCeed(ceed, CeedOperatorDestroy(&ceed_data->op_setup_vol));
   PetscCall(OperatorApplyContextDestroy(ceed_data->op_ics_ctx));
   PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_rhs_vol));
   PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_ifunction_vol));
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 49795d2b5f..767d5bddf9 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -154,10 +154,9 @@ struct CeedData_private {
   CeedVector           x_coord, q_data;
   CeedBasis            basis_x, basis_xc, basis_q, basis_x_sur, basis_q_sur;
   CeedElemRestriction  elem_restr_x, elem_restr_q, elem_restr_qd_i;
-  CeedOperator         op_setup_vol;
   OperatorApplyContext op_ics_ctx;
-  CeedQFunction        qf_setup_vol, qf_ics, qf_rhs_vol, qf_ifunction_vol, qf_setup_sur, qf_apply_inflow, qf_apply_inflow_jacobian, qf_apply_outflow,
-      qf_apply_outflow_jacobian, qf_apply_freestream, qf_apply_freestream_jacobian, qf_apply_slip, qf_apply_slip_jacobian;
+  CeedQFunction        qf_setup_sur, qf_apply_inflow, qf_apply_inflow_jacobian, qf_apply_outflow, qf_apply_outflow_jacobian, qf_apply_freestream,
+      qf_apply_freestream_jacobian, qf_apply_slip, qf_apply_slip_jacobian;
 };
 
 typedef struct {
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 18630c0279..ba68f83569 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -240,9 +240,6 @@ PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_
 
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc) {
   PetscFunctionBeginUser;
-  // *****************************************************************************
-  // Set up CEED objects for the interior domain (volume)
-  // *****************************************************************************
   const PetscInt num_comp_q = 5;
   const CeedInt  dim = problem->dim, num_comp_x = problem->dim, q_data_size_vol = problem->q_data_size_vol;
   CeedInt        jac_data_size_vol = num_comp_q + 6 + 3;
@@ -260,9 +257,6 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
   DMLabel             domain_label = NULL;
   PetscInt            label_value = 0, height = 0, dm_field = 0;
 
-  // -----------------------------------------------------------------------------
-  // CEED Bases
-  // -----------------------------------------------------------------------------
   DM dm_coord;
   PetscCall(DMGetCoordinateDM(dm, &dm_coord));
 
@@ -271,134 +265,91 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
   PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &ceed_data->basis_xc));
   PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q, &num_qpts));
 
-  // -----------------------------------------------------------------------------
-  // CEED Restrictions
-  // -----------------------------------------------------------------------------
-  // -- Create restriction
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q));
   PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x));
   PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_vol, &ceed_data->elem_restr_qd_i));
   PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i));
-  // -- Create E vectors
+
   PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL));
   PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL));
   PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL));
-
-  // -----------------------------------------------------------------------------
-  // CEED QFunctions
-  // -----------------------------------------------------------------------------
-  // -- Create QFunction for quadrature data
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_vol.qfunction, problem->setup_vol.qfunction_loc, &ceed_data->qf_setup_vol));
-  if (problem->setup_vol.qfunction_context) {
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_vol, problem->setup_vol.qfunction_context));
-  }
-  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_vol, 0));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_vol, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_vol, "weight", 1, CEED_EVAL_WEIGHT));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-
-  // -- Create QFunction for ICs
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &ceed_data->qf_ics));
-  PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_ics, problem->ics.qfunction_context));
-  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_ics, 0));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ics, "x", num_comp_x, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ics, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ics, "q0", num_comp_q, CEED_EVAL_NONE));
-
-  // -- Create QFunction for RHS
-  if (problem->apply_vol_rhs.qfunction) {
-    PetscCallCeed(
-        ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &ceed_data->qf_rhs_vol));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_rhs_vol, problem->apply_vol_rhs.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_rhs_vol, 0));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
-  }
-
-  // -- Create QFunction for IFunction
-  if (problem->apply_vol_ifunction.qfunction) {
-    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ifunction.qfunction, problem->apply_vol_ifunction.qfunction_loc,
-                                                    &ceed_data->qf_ifunction_vol));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_ifunction_vol, problem->apply_vol_ifunction.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_ifunction_vol, 0));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
-  }
-
-  CeedQFunction qf_ijacobian_vol = NULL;
-  if (problem->apply_vol_ijacobian.qfunction) {
-    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ijacobian.qfunction, problem->apply_vol_ijacobian.qfunction_loc,
-                                                    &qf_ijacobian_vol));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ijacobian_vol, problem->apply_vol_ijacobian.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ijacobian_vol, 0));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "dq", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "Grad_dq", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
-  }
-
-  // ---------------------------------------------------------------------------
-  // Element coordinates
-  // ---------------------------------------------------------------------------
-  // -- Create CEED vector
   PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL));
+  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_qd_i, &ceed_data->q_data, NULL));
+  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL));
+
+  {  // -- Copy PETSc coordinate vector into CEED vector
+    Vec X_loc;
+    DM  cdm;
 
-  // -- Copy PETSc vector in CEED vector
-  Vec X_loc;
-  {
-    DM cdm;
     PetscCall(DMGetCellCoordinateDM(dm, &cdm));
     if (cdm) {
       PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc));
     } else {
       PetscCall(DMGetCoordinatesLocal(dm, &X_loc));
     }
+    PetscCall(VecScale(X_loc, problem->dm_scale));
+    PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord));
   }
-  PetscCall(VecScale(X_loc, problem->dm_scale));
-  PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord));
 
-  // -----------------------------------------------------------------------------
-  // CEED vectors
-  // -----------------------------------------------------------------------------
-  // -- Create CEED vector for geometric data
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_qd_i, &ceed_data->q_data, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL));
+  {  // -- Create quadrature data
+    CeedQFunction qf_setup_vol;
+    CeedOperator  op_setup_vol;
 
-  // -----------------------------------------------------------------------------
-  // CEED Operators
-  // -----------------------------------------------------------------------------
-  // -- Create CEED operator for quadrature data
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_vol, NULL, NULL, &ceed_data->op_setup_vol));
-  PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "dx", ceed_data->elem_restr_x, ceed_data->basis_x, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x, CEED_VECTOR_NONE));
-  PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  // -- Create CEED operator for ICs
-  CeedOperator op_ics;
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_ics, NULL, NULL, &op_ics));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ics, "evaluation time", &user->phys->ics_time_label));
-  PetscCall(OperatorApplyContextCreate(NULL, dm, user->ceed, op_ics, ceed_data->x_coord, NULL, NULL, user->Q_loc, &ceed_data->op_ics_ctx));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_ics));
-
-  // Create CEED operator for RHS
-  if (ceed_data->qf_rhs_vol) {
-    CeedOperator op;
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_rhs_vol, NULL, NULL, &op));
+    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_vol.qfunction, problem->setup_vol.qfunction_loc, &qf_setup_vol));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_setup_vol, problem->setup_vol.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup_vol, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_vol, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_vol, "weight", 1, CEED_EVAL_WEIGHT));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup_vol, NULL, NULL, &op_setup_vol));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_vol, "dx", ceed_data->elem_restr_x, ceed_data->basis_x, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_vol, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x, CEED_VECTOR_NONE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+    PetscCallCeed(ceed, CeedOperatorApply(op_setup_vol, ceed_data->x_coord, ceed_data->q_data, CEED_REQUEST_IMMEDIATE));
+
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup_vol));
+    PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_vol));
+  }
+
+  {  // -- Create QFunction for ICs
+    CeedQFunction qf_ics;
+    CeedOperator  op_ics;
+
+    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &qf_ics));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ics, problem->ics.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ics, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ics, "x", num_comp_x, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ics, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ics, "q0", num_comp_q, CEED_EVAL_NONE));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ics, NULL, NULL, &op_ics));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ics, "evaluation time", &user->phys->ics_time_label));
+    PetscCall(OperatorApplyContextCreate(NULL, dm, user->ceed, op_ics, ceed_data->x_coord, NULL, NULL, user->Q_loc, &ceed_data->op_ics_ctx));
+
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ics));
+    PetscCallCeed(ceed, CeedOperatorDestroy(&op_ics));
+  }
+
+  if (problem->apply_vol_rhs.qfunction) {
+    CeedQFunction qf_rhs_vol;
+    CeedOperator  op;
+
+    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &qf_rhs_vol));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_rhs_vol, problem->apply_vol_rhs.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_rhs_vol, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_rhs_vol, NULL, NULL, &op));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
@@ -406,12 +357,28 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     user->op_rhs_vol = op;
+
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_rhs_vol));
   }
 
-  // -- CEED operator for IFunction
-  if (ceed_data->qf_ifunction_vol) {
-    CeedOperator op;
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_ifunction_vol, NULL, NULL, &op));
+  if (problem->apply_vol_ifunction.qfunction) {
+    CeedQFunction qf_ifunction_vol;
+    CeedOperator  op;
+
+    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ifunction.qfunction, problem->apply_vol_ifunction.qfunction_loc,
+                                                    &qf_ifunction_vol));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ifunction_vol, problem->apply_vol_ifunction.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ifunction_vol, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ifunction_vol, NULL, NULL, &op));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed));
@@ -422,11 +389,25 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
 
     user->op_ifunction_vol = op;
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ifunction_vol));
   }
 
   CeedOperator op_ijacobian_vol = NULL;
-  if (qf_ijacobian_vol) {
-    CeedOperator op;
+  if (problem->apply_vol_ijacobian.qfunction) {
+    CeedQFunction qf_ijacobian_vol;
+    CeedOperator  op;
+
+    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ijacobian.qfunction, problem->apply_vol_ijacobian.qfunction_loc,
+                                                    &qf_ijacobian_vol));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ijacobian_vol, problem->apply_vol_ijacobian.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ijacobian_vol, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "dq", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "Grad_dq", num_comp_q * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
+
     PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
@@ -435,6 +416,7 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
     op_ijacobian_vol = op;
+
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ijacobian_vol));
   }
 
@@ -449,11 +431,16 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
   // CEED Bases
   // -----------------------------------------------------------------------------
 
-  DMLabel  label   = 0;
-  PetscInt face_id = 0;
-  PetscInt field   = 0;  // Still want the normal, default field
-  PetscCall(CreateBasisFromPlex(ceed, dm, label, face_id, height, field, &ceed_data->basis_q_sur));
-  PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, face_id, height, field, &ceed_data->basis_x_sur));
+  {
+    DM dm_coord;
+
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    DMLabel  label   = NULL;
+    PetscInt face_id = 0;
+    PetscInt field   = 0;  // Still want the normal, default field
+    PetscCall(CreateBasisFromPlex(ceed, dm, label, face_id, height, field, &ceed_data->basis_q_sur));
+    PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, face_id, height, field, &ceed_data->basis_x_sur));
+  }
 
   // -----------------------------------------------------------------------------
   // CEED QFunctions
@@ -480,9 +467,6 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
   // *****************************************************************************
   // CEED Operator Apply
   // *****************************************************************************
-  // -- Apply CEED Operator for the geometric data
-  PetscCallCeed(ceed, CeedOperatorApply(ceed_data->op_setup_vol, ceed_data->x_coord, ceed_data->q_data, CEED_REQUEST_IMMEDIATE));
-
   // -- Create and apply CEED Composite Operator for the entire domain
   if (!user->phys->implicit) {  // RHS
     CeedOperator op_rhs;

From fb133d4b0da93cd2b1f52e39cd1305dc4c4450c3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 25 Apr 2024 14:31:55 -0600
Subject: [PATCH 005/571] cpu - add AssembleAddDiagonal for AtPoints Operator

---
 backends/ref/ceed-ref-operator.c | 207 ++++++++++++++++++++++++++++++-
 tests/t533-operator.c            |   4 +-
 tests/t592-operator.c            |   2 +-
 tests/t593-operator.c            |   2 +-
 tests/t594-operator.c            | 179 ++++++++++++++++++++++++++
 5 files changed, 389 insertions(+), 5 deletions(-)
 create mode 100644 tests/t594-operator.c

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index f525460ea6..5347580d2c 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1109,8 +1109,212 @@ static int CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref(CeedOperator op
 }
 
 //------------------------------------------------------------------------------
-// Assemble Operator
+// Assemble Operator Diagonal AtPoints
 //------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  bool                is_active_at_points = true;
+  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, elem_size_active = 1, num_comp_active;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {0};
+  Ceed                ceed;
+  CeedVector          point_coords = NULL, in_vec, out_vec;
+  CeedElemRestriction rstr_points  = NULL;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Ref   *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Ref(op));
+
+  // Ceed
+  {
+    Ceed ceed_parent;
+
+    CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+    CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
+    if (ceed_parent) ceed = ceed_parent;
+  }
+
+  // Point coordinates
+  CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+
+  // Input and output vectors
+  {
+    CeedSize input_size, output_size;
+
+    CeedCallBackend(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
+    CeedCallBackend(CeedVectorCreate(ceed, input_size, &in_vec));
+    CeedCallBackend(CeedVectorCreate(ceed, output_size, &out_vec));
+    CeedCallBackend(CeedVectorSetValue(out_vec, 0.0));
+  }
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+
+  // Check if active field is at points
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedRestrictionType rstr_type;
+    CeedVector          vec;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    // Skip non-active input
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+
+    // Get active restriction type
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
+  }
+
+  // Loop through elements
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt num_points, e_vec_size = 0;
+
+    // Setup points for element
+    CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+
+    // Input basis apply for non-active bases
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
+                                                       impl->point_coords_elem, true, e_data, impl, request));
+
+    // Loop over points on element
+    e_vec_size = (is_active_at_points ? num_points : elem_size_active) * num_comp_active;
+    for (CeedInt s = 0; s < e_vec_size; s++) {
+      for (CeedInt i = 0; i < num_input_fields; i++) {
+        bool                is_active_input = false;
+        CeedInt             size;
+        CeedRestrictionType rstr_type;
+        CeedEvalMode        eval_mode;
+        CeedVector          vec;
+        CeedElemRestriction elem_rstr;
+        CeedBasis           basis;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        // Skip non-active input
+        is_active_input = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active_input) continue;
+
+        // Get elem_size, eval_mode, size
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+        // Update unit vector
+        {
+          CeedScalar *array;
+
+          if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 1.0;
+          if (s > 0) array[s - 1] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
+        // Basis action
+        switch (eval_mode) {
+          case CEED_EVAL_NONE:
+            break;
+          // Note - these basis eval modes require FEM fields
+          case CEED_EVAL_INTERP:
+          case CEED_EVAL_GRAD:
+          case CEED_EVAL_DIV:
+          case CEED_EVAL_CURL:
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs_in[i],
+                                                   impl->q_vecs_in[i]));
+            break;
+          case CEED_EVAL_WEIGHT:
+            break;  // No action
+        }
+      }
+
+      // -- Q function
+      if (!impl->is_identity_qf) {
+        CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
+      }
+
+      // -- Output basis apply and restriction
+      CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
+                                                          num_output_fields, op, out_vec, impl->point_coords_elem, impl, request));
+
+      // -- Grab diagonal value
+      for (CeedInt i = 0; i < num_output_fields; i++) {
+        bool                is_active_input = false;
+        CeedRestrictionType rstr_type;
+        CeedEvalMode        eval_mode;
+        CeedVector          vec;
+        CeedElemRestriction elem_rstr;
+        CeedBasis           basis;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        // ---- Skip non-active input
+        is_active_input = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active_input) continue;
+
+        // ---- Get elem_size, eval_mode, size
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+        // ---- Basis action
+        switch (eval_mode) {
+          case CEED_EVAL_NONE:
+            break;  // No action
+          case CEED_EVAL_INTERP:
+          case CEED_EVAL_GRAD:
+          case CEED_EVAL_DIV:
+          case CEED_EVAL_CURL:
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                                   impl->e_vecs_out[i]));
+            break;
+          // LCOV_EXCL_START
+          case CEED_EVAL_WEIGHT: {
+            return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+            // LCOV_EXCL_STOP
+          }
+        }
+        // ---- Update output vector
+        {
+          CeedScalar *array, current_value = 0.0;
+
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
+          current_value = array[s];
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
+          CeedCallBackend(CeedVectorSetValue(impl->e_vecs_out[i], 0.0));
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
+          array[s] = current_value;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
+        }
+        // ---- Restrict output block
+        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+        if (rstr_type == CEED_RESTRICTION_POINTS) {
+          CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
+        } else {
+          CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
+        }
+      }
+    }
+    num_points_offset += num_points;
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+
+  // Cleanup
+  CeedCallBackend(CeedVectorDestroy(&in_vec));
+  CeedCallBackend(CeedVectorDestroy(&out_vec));
+  CeedCallBackend(CeedVectorDestroy(&point_coords));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  return CEED_ERROR_SUCCESS;
+}
 
 //------------------------------------------------------------------------------
 // Operator Destroy
@@ -1180,6 +1384,7 @@ int CeedOperatorCreateAtPoints_Ref(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Ref));
   CeedCallBackend(
       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref));
   return CEED_ERROR_SUCCESS;
diff --git a/tests/t533-operator.c b/tests/t533-operator.c
index a01dabda4a..2a19143ffa 100644
--- a/tests/t533-operator.c
+++ b/tests/t533-operator.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
 
   // Manually assemble diagonal
   CeedVectorSetValue(u, 0.0);
-  for (int i = 0; i < num_dofs; i++) {
+  for (CeedInt i = 0; i < num_dofs; i++) {
     CeedScalar       *u_array;
     const CeedScalar *v_array;
 
@@ -113,7 +113,7 @@ int main(int argc, char **argv) {
     const CeedScalar *assembled_array;
 
     CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
-    for (int i = 0; i < num_dofs; i++) {
+    for (CeedInt i = 0; i < num_dofs; i++) {
       if (fabs(assembled_array[i] - assembled_true[i]) > 100. * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]);
diff --git a/tests/t592-operator.c b/tests/t592-operator.c
index e0ccdace2e..91e519f3bb 100644
--- a/tests/t592-operator.c
+++ b/tests/t592-operator.c
@@ -1,6 +1,6 @@
 /// @file
 /// Test assembly of mass matrix operator QFunction at points
-/// \test Test assembly of mass matrix operator QFunction
+/// \test Test assembly of mass matrix operator QFunction at points
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>
diff --git a/tests/t593-operator.c b/tests/t593-operator.c
index 5b145d0884..2a2daceb88 100644
--- a/tests/t593-operator.c
+++ b/tests/t593-operator.c
@@ -1,5 +1,5 @@
 /// @file
-/// Bug reproducer for memcheck backends at points
+/// Test 1D mass matrix operator at points with heterogeneous points per element
 /// \test Test 1D mass matrix operator at points with heterogeneous points per element
 #include <ceed.h>
 #include <math.h>
diff --git a/tests/t594-operator.c b/tests/t594-operator.c
new file mode 100644
index 0000000000..2d4e6d876c
--- /dev/null
+++ b/tests/t594-operator.c
@@ -0,0 +1,179 @@
+/// @file
+/// Test diagonal assembly of mass matrix operator at points
+/// \test Test diagonal assembly of mass matrix operator at points
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "t500-operator.h"
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedInt    num_elem = 3, dim = 1, p = 3, q = 5;
+  CeedInt    num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1, num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedInt    ind_x[num_elem * 2], ind_u[num_elem * p], ind_x_points[num_elem + 1 + num_points];
+  CeedScalar x_array_mesh[num_nodes_x], x_array_points[num_points], assembled_true[num_nodes_u];
+  CeedVector x_points = NULL, x_elem = NULL, q_data = NULL, u = NULL, v = NULL, assembled = NULL;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u;
+  CeedBasis           basis_x, basis_u;
+  CeedQFunction       qf_setup, qf_mass;
+  CeedOperator        op_setup, op_mass;
+  bool                is_at_points;
+
+  CeedInit(argv[1], &ceed);
+
+  // Mesh coordinates
+  for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1);
+  for (CeedInt i = 0; i < num_elem; i++) {
+    ind_x[2 * i + 0] = i;
+    ind_x[2 * i + 1] = i + 1;
+  }
+  CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
+  CeedVectorCreate(ceed, num_nodes_x, &x_elem);
+  CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_USE_POINTER, x_array_mesh);
+
+  // U mesh
+  for (CeedInt i = 0; i < num_elem; i++) {
+    for (CeedInt j = 0; j < p; j++) {
+      ind_u[p * i + j] = i * (p - 1) + j;
+    }
+  }
+  CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u);
+
+  // Point reference coordinates
+  {
+    CeedScalar weight_tmp[num_points_per_elem + 1];
+    CeedInt    current_index = 0;
+
+    // Use num_points_per_elem + 1 to test non-uniform quadrature
+    CeedGaussQuadrature(num_points_per_elem + 1, x_array_points, weight_tmp);
+    ind_x_points[0] = num_elem + 1;
+    for (CeedInt p = 0; p < num_points_per_elem + 1; p++, current_index++) {
+      ind_x_points[num_elem + 1 + current_index] = current_index;
+    }
+    // Use num_points_per_elem for middle elements
+    for (CeedInt e = 1; e < num_elem - 1; e++) {
+      CeedGaussQuadrature(num_points_per_elem, &x_array_points[current_index], weight_tmp);
+      ind_x_points[e] = num_elem + 1 + current_index;
+      for (CeedInt p = 0; p < num_points_per_elem; p++, current_index++) {
+        ind_x_points[num_elem + 1 + current_index] = current_index;
+      }
+    }
+    // Use num_points_per_elem - 1 to test non-uniform quadrature
+    CeedGaussQuadrature(num_points_per_elem - 1, &x_array_points[current_index], weight_tmp);
+    ind_x_points[num_elem - 1] = num_elem + 1 + current_index;
+    for (CeedInt p = 0; p < num_points_per_elem - 1; p++, current_index++) {
+      ind_x_points[num_elem + 1 + current_index] = current_index;
+    }
+    ind_x_points[num_elem] = num_elem + 1 + current_index;
+
+    CeedVectorCreate(ceed, num_elem * num_points_per_elem, &x_points);
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_USE_POINTER, x_array_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points,
+                                      &elem_restriction_q_data);
+
+    // Q data
+    CeedVectorCreate(ceed, num_points, &q_data);
+  }
+
+  // Basis creation
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x);
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+
+  // Setup geometric scaling
+  CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
+
+  CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+  CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Mass operator
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+  CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass);
+  CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_mass, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  CeedVectorCreate(ceed, num_nodes_u, &u);
+  CeedVectorSetValue(u, 0.0);
+  CeedVectorCreate(ceed, num_nodes_u, &v);
+
+  // Assemble diagonal
+  CeedVectorCreate(ceed, num_nodes_u, &assembled);
+  CeedOperatorLinearAssembleDiagonal(op_mass, assembled, CEED_REQUEST_IMMEDIATE);
+
+  // Manually assemble diagonal
+  CeedVectorSetValue(u, 0.0);
+  for (CeedInt i = 0; i < num_nodes_u; i++) {
+    CeedScalar       *u_array;
+    const CeedScalar *v_array;
+
+    // Set input
+    CeedVectorGetArray(u, CEED_MEM_HOST, &u_array);
+    u_array[i] = 1.0;
+    if (i) u_array[i - 1] = 0.0;
+    CeedVectorRestoreArray(u, &u_array);
+
+    // Compute diag entry for DoF i
+    CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);
+
+    // Retrieve entry
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    assembled_true[i] = v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  // Check output
+  {
+    const CeedScalar *assembled_array;
+
+    CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
+    for (CeedInt i = 0; i < num_nodes_u; i++) {
+      if (fabs(assembled_array[i] - assembled_true[i]) > 100. * CEED_EPSILON) {
+        // LCOV_EXCL_START
+        printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]);
+        // LCOV_EXCL_STOP
+      }
+    }
+    CeedVectorRestoreArrayRead(assembled, &assembled_array);
+  }
+
+  // Cleanup
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&x_elem);
+  CeedVectorDestroy(&assembled);
+  CeedElemRestrictionDestroy(&elem_restriction_q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_x);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_u);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedQFunctionDestroy(&qf_setup);
+  CeedQFunctionDestroy(&qf_mass);
+  CeedOperatorDestroy(&op_setup);
+  CeedOperatorDestroy(&op_mass);
+  CeedDestroy(&ceed);
+  return 0;
+}

From 9c674643a5d0eef8e19137a125392bb41b7be0f4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 26 Apr 2024 10:07:38 -0600
Subject: [PATCH 006/571] swarm - use diagonal assembly in BPSSwarm

---
 examples/petsc/bpsswarm.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 5796cf7471..0609a8d7c0 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -209,6 +209,7 @@ int main(int argc, char **argv) {
   PetscCall(MatCreateShell(comm, l_size, l_size, g_size, g_size, op_apply_ctx, &mat_O));
   PetscCall(MatSetDM(mat_O, dm_mesh));
   PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed));
+  PetscCall(MatShellSetOperation(mat_O, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag));
 
   // Set up libCEED
   CeedInit(ceed_resource, &ceed);
@@ -302,14 +303,9 @@ int main(int argc, char **argv) {
     PetscCall(KSPGetPC(ksp, &pc));
     if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) {
       PetscCall(PCSetType(pc, PCJACOBI));
-      PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM));
+      PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL));
     } else {
       PetscCall(PCSetType(pc, PCNONE));
-      MatNullSpace nullspace;
-
-      PetscCall(MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, 0, &nullspace));
-      PetscCall(MatSetNullSpace(mat_O, nullspace));
-      PetscCall(MatNullSpaceDestroy(&nullspace));
     }
     PetscCall(KSPSetType(ksp, KSPCG));
     PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL));

From f0db655e6e1f1937bd50d5bb7a8fc42d5473b178 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sat, 27 Apr 2024 11:23:15 -0600
Subject: [PATCH 007/571] fix: Correct P0 deprecation

the PetscOptionsDeprecated caused the P0 option to be ignored for some
reason
---
 examples/fluids/problems/blasius.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index 8e662dcfb6..4ea6389b99 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -268,9 +268,8 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   PetscCall(PetscOptionsBool("-weakT", "Change from rho weak to T weak at inflow", NULL, weakT, &weakT, NULL));
   PetscCall(PetscOptionsScalar("-velocity_infinity", "Velocity at boundary layer edge", NULL, U_inf, &U_inf, NULL));
   PetscCall(PetscOptionsScalar("-temperature_infinity", "Temperature at boundary layer edge", NULL, T_inf, &T_inf, NULL));
-  PetscCall(PetscOptionsScalar("-pressure_infinity", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, &flg));
   PetscCall(PetscOptionsDeprecated("-P0", "-pressure_infinity", "libCEED 0.12.0", "Use -pressure_infinity to set pressure at boundary layer edge"));
-  if (!flg) PetscCall(PetscOptionsScalar("-P0", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, &flg));
+  PetscCall(PetscOptionsScalar("-pressure_infinity", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, &flg));
   PetscCall(PetscOptionsScalar("-temperature_wall", "Temperature at wall", NULL, T_wall, &T_wall, NULL));
   PetscCall(PetscOptionsScalar("-delta0", "Boundary layer height at inflow", NULL, delta0, &delta0, NULL));
   PetscCall(PetscOptionsInt("-n_chebyshev", "Number of Chebyshev terms", NULL, N, &N, NULL));

From c10408e0865ddcf494d231320ea714b7fec69dd3 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sat, 27 Apr 2024 11:50:47 -0600
Subject: [PATCH 008/571] fluids: Correct PetscOptionsDeprecated usage

It will automatically replace the old option with the new option if
`PetscOptionsDeprecated` is specified before the new option name.
---
 examples/fluids/problems/newtonian.c | 2 --
 examples/fluids/src/cloptions.c      | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 9f39eebaf5..c28bf50edf 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -229,8 +229,6 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
 
   PetscInt dim = problem->dim;
   PetscCall(PetscOptionsDeprecated("-g", "-gravity", "libCEED 0.11.1", NULL));
-  PetscCall(PetscOptionsRealArray("-g", "Gravitational acceleration vector", NULL, g, &dim, &given_option));
-  dim = problem->dim;
   PetscCall(PetscOptionsRealArray("-gravity", "Gravitational acceleration vector", NULL, g, &dim, &given_option));
   if (given_option) PetscCheck(dim == 3, comm, PETSC_ERR_ARG_SIZ, "Gravity vector must be size 3, %" PetscInt_FMT " values given", dim);
 
diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c
index 1fa601231a..abf3ea38b0 100644
--- a/examples/fluids/src/cloptions.c
+++ b/examples/fluids/src/cloptions.c
@@ -152,10 +152,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC
                                        "Use -bc_symmetry_[x,y,z] for direct equivalency, or -bc_slip for weak, Riemann-based, direction-invariant "
                                        "slip/no-penatration boundary conditions"));
       PetscCall(PetscOptionsIntArray(flags[j], "Face IDs to apply symmetry BC", NULL, bc->symmetries[j], &bc->num_symmetry[j], &flg));
-      if (!flg) {
-        bc->num_symmetry[j] = 16;
-        PetscCall(PetscOptionsIntArray(deprecated[j], "Face IDs to apply slip BC", NULL, bc->symmetries[j], &bc->num_symmetry[j], &flg));
-      }
       if (bc->num_symmetry[j] > 0) has_symmetry = PETSC_TRUE;
     }
 

From 2788647c65156ec721245fdd5b42546163b72151 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sun, 28 Apr 2024 18:38:20 -0600
Subject: [PATCH 009/571] fix(fluids): Make local Vecs use PETSC_COMM_SELF in
 turbstats

Also adds check in CeedOperatorCreateLocalVecs to verify that comm is of
size 1.
---
 examples/fluids/src/petsc_ops.c      |  3 +++
 examples/fluids/src/turb_spanstats.c | 11 +++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index 3706751db0..99f88d0999 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -176,9 +176,12 @@ VecType DMReturnVecType(DM dm) {
 PetscErrorCode CeedOperatorCreateLocalVecs(CeedOperator op, VecType vec_type, MPI_Comm comm, Vec *input, Vec *output) {
   CeedSize input_size, output_size;
   Ceed     ceed;
+  int      comm_size;
 
   PetscFunctionBeginUser;
   PetscCall(CeedOperatorGetCeed(op, &ceed));
+  PetscCallMPI(MPI_Comm_size(comm, &comm_size));
+  PetscCheck(comm_size == 1, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "MPI_Comm must be of size 1, recieved comm of size %d", comm_size);
   PetscCallCeed(ceed, CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
   if (input) {
     PetscCall(VecCreate(comm, input));
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 54ab617afc..ed34c808d7 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -169,7 +169,7 @@ PetscErrorCode GetQuadratureCoords(Ceed ceed, DM dm, CeedElemRestriction elem_re
   PetscCallCeed(ceed, CeedOperatorSetField(op_quad_coords, "input", elem_restr_x, basis_x, x_coords));
   PetscCallCeed(ceed, CeedOperatorSetField(op_quad_coords, "output", elem_restr_qx, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
 
-  PetscCall(CeedOperatorCreateLocalVecs(op_quad_coords, DMReturnVecType(dm), PetscObjectComm((PetscObject)dm), NULL, Qx_coords));
+  PetscCall(CeedOperatorCreateLocalVecs(op_quad_coords, DMReturnVecType(dm), PETSC_COMM_SELF, NULL, Qx_coords));
   PetscCall(OperatorApplyContextCreate(NULL, NULL, ceed, op_quad_coords, CEED_VECTOR_NONE, NULL, NULL, NULL, &op_quad_coords_ctx));
 
   PetscCall(ApplyCeedOperatorLocalToLocal(NULL, *Qx_coords, op_quad_coords_ctx));
@@ -301,7 +301,6 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
   CeedOperator  op_mass, op_setup_sur, op_proj_rhs;
   CeedQFunction qf_mass, qf_stats_proj;
   CeedInt       q_data_size, num_comp_stats = user->spanstats.num_comp_stats;
-  MPI_Comm      comm = PetscObjectComm((PetscObject)user->spanstats.dm);
 
   PetscFunctionBeginUser;
   // -- Create Operator for RHS of L^2 projection of statistics
@@ -314,7 +313,7 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
   PetscCallCeed(ceed, CeedOperatorSetField(op_proj_rhs, "output", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
 
   PetscCall(OperatorApplyContextCreate(NULL, user->spanstats.dm, ceed, op_proj_rhs, NULL, NULL, NULL, NULL, &user->spanstats.op_proj_rhs_ctx));
-  PetscCall(CeedOperatorCreateLocalVecs(op_proj_rhs, DMReturnVecType(user->spanstats.dm), comm, &user->spanstats.Parent_Stats_loc, NULL));
+  PetscCall(CeedOperatorCreateLocalVecs(op_proj_rhs, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, &user->spanstats.Parent_Stats_loc, NULL));
 
   // -- Setup LHS of L^2 projection
   // Get q_data for mass matrix operator
@@ -340,7 +339,7 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
 
     PetscCall(MatCeedCreate(user->spanstats.dm, user->spanstats.dm, op_mass, NULL, &mat_mass));
 
-    PetscCall(KSPCreate(comm, &ksp));
+    PetscCall(KSPCreate(PetscObjectComm((PetscObject)user->spanstats.dm), &ksp));
     PetscCall(KSPSetOptionsPrefix(ksp, "turbulence_spanstats_"));
     {
       PC pc;
@@ -431,8 +430,8 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
   PetscCall(OperatorApplyContextCreate(user->dm, user->spanstats.dm, user->ceed, op_stats_collect, user->q_ceed, NULL, NULL, NULL,
                                        &user->spanstats.op_stats_collect_ctx));
 
-  PetscCall(CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PetscObjectComm((PetscObject)user->spanstats.dm), NULL,
-                                        &user->spanstats.Child_Stats_loc));
+  PetscCall(
+      CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, NULL, &user->spanstats.Child_Stats_loc));
   PetscCall(VecZeroEntries(user->spanstats.Child_Stats_loc));
 
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stats_collect));

From e9c36be085200ad84b427d3f41921f3626bb31f1 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 26 Apr 2024 10:48:25 -0600
Subject: [PATCH 010/571] fluids: Consolidate QF and Op creation for BCs

---
 examples/fluids/navierstokes.c                |   7 -
 examples/fluids/navierstokes.h                |  10 +-
 examples/fluids/src/setuplibceed.c            | 303 ++++++++++--------
 .../fluids/src/strong_boundary_conditions.c   |  17 +-
 examples/fluids/src/turb_spanstats.c          |   2 -
 5 files changed, 186 insertions(+), 153 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 3fe6aa3fa0..0e8cf6b4a3 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -268,9 +268,6 @@ int main(int argc, char **argv) {
   // -- Bases
   PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_q));
   PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_x));
-  PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_xc));
-  PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_q_sur));
-  PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_x_sur));
 
   // -- Restrictions
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&ceed_data->elem_restr_q));
@@ -298,10 +295,6 @@ int main(int argc, char **argv) {
 
   // -- QFunctions
   PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_sur));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow_jacobian));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_freestream));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_freestream_jacobian));
 
   // -- Operators
   PetscCall(OperatorApplyContextDestroy(ceed_data->op_ics_ctx));
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 767d5bddf9..18967835c0 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -152,11 +152,10 @@ struct AppCtx_private {
 // libCEED data struct
 struct CeedData_private {
   CeedVector           x_coord, q_data;
-  CeedBasis            basis_x, basis_xc, basis_q, basis_x_sur, basis_q_sur;
+  CeedBasis            basis_x, basis_q;
   CeedElemRestriction  elem_restr_x, elem_restr_q, elem_restr_qd_i;
   OperatorApplyContext op_ics_ctx;
-  CeedQFunction        qf_setup_sur, qf_apply_inflow, qf_apply_inflow_jacobian, qf_apply_outflow, qf_apply_outflow_jacobian, qf_apply_freestream,
-      qf_apply_freestream_jacobian, qf_apply_slip, qf_apply_slip_jacobian;
+  CeedQFunction        qf_setup_sur;
 };
 
 typedef struct {
@@ -348,11 +347,6 @@ PetscErrorCode DMPlexCeedElemRestrictionCollocatedCreate(Ceed ceed, DM dm, DMLab
 
 PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, CeedInt label_value, CeedInt height, CeedInt dm_field, CeedBasis *basis);
 
-// Utility function to create CEED Composite Operator for the entire domain
-PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator op_apply_vol,
-                                       CeedOperator op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur,
-                                       CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian);
-
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc);
 
 // -----------------------------------------------------------------------------
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index ba68f83569..24c6a8a61c 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -90,17 +90,16 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) {
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel domain_label, PetscInt label_value, CeedInt height, CeedInt Q_sur,
-                                CeedInt q_data_size_sur, CeedInt jac_data_size_sur, CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian,
-                                CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) {
-  CeedVector          q_data_sur, jac_data_sur = NULL;
-  CeedOperator        op_setup_sur, op_apply_bc, op_apply_bc_jacobian = NULL;
+static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel domain_label, PetscInt label_value, CeedInt height,
+                                       CeedInt Q_sur, CeedInt q_data_size_sur, CeedInt jac_data_size_sur, CeedBasis basis_q_sur,
+                                       CeedBasis basis_x_sur, CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian, CeedOperator op_apply,
+                                       CeedOperator op_apply_ijacobian) {
+  CeedVector          q_data_sur, jac_data_sur          = NULL;
+  CeedOperator        op_apply_bc, op_apply_bc_jacobian = NULL;
   CeedElemRestriction elem_restr_x_sur, elem_restr_q_sur, elem_restr_qd_i_sur, elem_restr_jd_i_sur = NULL;
-  CeedInt             num_qpts_sur, dm_field = 0;
+  PetscInt            dm_field = 0;
 
   PetscFunctionBeginUser;
-  // --- Get number of quadrature points for the boundaries
-  PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q_sur, &num_qpts_sur));
 
   // ---- CEED Restriction
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &elem_restr_q_sur));
@@ -112,44 +111,43 @@ PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel do
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i_sur, &jac_data_sur, NULL));
   }
 
-  // ---- CEED Vector
-  CeedInt loc_num_elem_sur;
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumElements(elem_restr_q_sur, &loc_num_elem_sur));
-  PetscCallCeed(ceed, CeedVectorCreate(ceed, q_data_size_sur * loc_num_elem_sur * num_qpts_sur, &q_data_sur));
+  {  // Create q_data_sur vector
+    CeedOperator op_setup_sur;
 
-  // ---- CEED Operator
-  // ----- CEED Operator for Setup (geometric factors)
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, ceed_data->basis_x_sur, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x_sur, CEED_VECTOR_NONE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_qd_i_sur, &q_data_sur, NULL));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, basis_x_sur, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_sur, CEED_VECTOR_NONE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+    PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, CEED_REQUEST_IMMEDIATE));
+    PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
+  }
 
   // ----- CEED Operator for Physics
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc, NULL, NULL, &op_apply_bc));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, q_data_sur));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, basis_x_sur, ceed_data->x_coord));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
   if (elem_restr_jd_i_sur)
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_NONE, jac_data_sur));
 
   if (qf_apply_bc_jacobian && elem_restr_jd_i_sur) {
     PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc_jacobian, NULL, NULL, &op_apply_bc_jacobian));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, q_data_sur));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, basis_x_sur, ceed_data->x_coord));
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_NONE, jac_data_sur));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
   }
 
-  // ----- Apply CEED operator for Setup
-  PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, CEED_REQUEST_IMMEDIATE));
-
   // ----- Apply Sub-Operator for Physics
-  PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply, op_apply_bc));
-  if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_bc_jacobian));
+  PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_apply, op_apply_bc));
+  if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_apply_ijacobian, op_apply_bc_jacobian));
 
   // ----- Cleanup
   PetscCallCeed(ceed, CeedVectorDestroy(&q_data_sur));
@@ -158,60 +156,14 @@ PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel do
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_x_sur));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i_sur));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_jd_i_sur));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_apply_bc));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_apply_bc_jacobian));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-// Utility function to create CEED Composite Operator for the entire domain
-PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator op_apply_vol,
-                                       CeedOperator op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur,
-                                       CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) {
-  DMLabel domain_label;
-
-  PetscFunctionBeginUser;
-  // Create Composite Operaters
-  PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, op_apply));
-  if (op_apply_ijacobian) PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, op_apply_ijacobian));
-
-  // --Apply Sub-Operator for the volume
-  PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply, op_apply_vol));
-  if (op_apply_ijacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_ijacobian_vol));
-
-  // -- Create Sub-Operator for in/outflow BCs
-  PetscCall(DMGetLabel(dm, "Face Sets", &domain_label));
-
-  // --- Create Sub-Operator for inflow boundaries
-  for (CeedInt i = 0; i < bc->num_inflow; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->inflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_inflow, ceed_data->qf_apply_inflow_jacobian, op_apply, op_apply_ijacobian));
-  }
-  // --- Create Sub-Operator for outflow boundaries
-  for (CeedInt i = 0; i < bc->num_outflow; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->outflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_outflow, ceed_data->qf_apply_outflow_jacobian, op_apply, op_apply_ijacobian));
-  }
-  // --- Create Sub-Operator for freestream boundaries
-  for (CeedInt i = 0; i < bc->num_freestream; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_freestream, ceed_data->qf_apply_freestream_jacobian, op_apply, op_apply_ijacobian));
-  }
-  // --- Create Sub-Operator for slip boundaries
-  for (CeedInt i = 0; i < bc->num_slip; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->slips[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_slip, ceed_data->qf_apply_slip_jacobian, op_apply, op_apply_ijacobian));
-  }
-
-  // ----- Get Context Labels for Operator
-  PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(*op_apply, "solution time", &phys->solution_time_label));
-  PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(*op_apply, "timestep size", &phys->timestep_size_label));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_x, PetscInt num_comp_q, PetscInt q_data_size_sur,
-                                 PetscInt jac_data_size_sur, ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian,
-                                 CeedQFunction *qf_apply_bc, CeedQFunction *qf_apply_bc_jacobian) {
+static PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_x, PetscInt num_comp_q, PetscInt q_data_size_sur,
+                                        PetscInt jac_data_size_sur, ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian,
+                                        CeedQFunction *qf_apply_bc, CeedQFunction *qf_apply_bc_jacobian) {
   PetscFunctionBeginUser;
   if (apply_bc.qfunction) {
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, apply_bc.qfunction, apply_bc.qfunction_loc, qf_apply_bc));
@@ -238,6 +190,115 @@ PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+// Utility function to create CEED Composite Operator for the entire domain
+static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc, ProblemData problem, CeedData ceed_data, CeedOperator op_apply,
+                                        CeedOperator op_apply_ijacobian) {
+  CeedInt       height = 1, num_comp_q, num_comp_x;
+  CeedInt       dim_sur, P_sur = user->app_ctx->degree + 1, Q_sur = P_sur + user->app_ctx->q_extra;
+  const CeedInt q_data_size_sur = problem->q_data_size_sur, jac_data_size_sur = user->phys->implicit ? problem->jac_data_size_sur : 0;
+  PetscInt      dim;
+  DMLabel       face_sets_label;
+  CeedBasis     basis_q_sur, basis_x_sur;
+
+  PetscFunctionBeginUser;
+  PetscCall(DMGetDimension(dm, &dim));
+  dim_sur = dim - height;
+  {  // Get number of components and coordinate dimension from op_apply
+    CeedOperator       *sub_ops;
+    CeedOperatorField   field;
+    PetscInt            sub_op_index = 0;  // will be 0 for the volume op
+    CeedElemRestriction elem_restr_q, elem_restr_x;
+
+    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(op_apply, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
+    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
+    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q));
+
+    PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "x", &field));
+    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_x));
+    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x));
+  }
+
+  {  // Get bases
+    DM dm_coord;
+
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    DMLabel  label       = NULL;
+    PetscInt label_value = 0;
+    PetscInt field       = 0;  // Still want the normal, default field
+    PetscCall(CreateBasisFromPlex(ceed, dm, label, label_value, height, field, &basis_q_sur));
+    PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, label_value, height, field, &basis_x_sur));
+  }
+
+  PetscCall(DMGetLabel(dm, "Face Sets", &face_sets_label));
+
+  {  // -- Create QFunction for quadrature data
+    PetscCallCeed(ceed,
+                  CeedQFunctionCreateInterior(ceed, 1, problem->setup_sur.qfunction, problem->setup_sur.qfunction_loc, &ceed_data->qf_setup_sur));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_sur, problem->setup_sur.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_sur, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "dx", num_comp_x * dim_sur, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_sur, "surface qdata", q_data_size_sur, CEED_EVAL_NONE));
+  }
+
+  {  // --- Create Sub-Operator for inflow boundaries
+    CeedQFunction qf_apply_inflow = NULL, qf_apply_inflow_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_inflow,
+                                problem->apply_inflow_jacobian, &qf_apply_inflow, &qf_apply_inflow_jacobian));
+    for (CeedInt i = 0; i < bc->num_inflow; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->inflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, basis_q_sur,
+                                 basis_x_sur, qf_apply_inflow, qf_apply_inflow_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_inflow));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_inflow_jacobian));
+  }
+
+  {  // --- Create Sub-Operator for outflow boundaries
+    CeedQFunction qf_apply_outflow = NULL, qf_apply_outflow_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_outflow,
+                                problem->apply_outflow_jacobian, &qf_apply_outflow, &qf_apply_outflow_jacobian));
+    for (CeedInt i = 0; i < bc->num_outflow; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->outflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
+                                 basis_q_sur, basis_x_sur, qf_apply_outflow, qf_apply_outflow_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_outflow));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_outflow_jacobian));
+  }
+
+  {  // --- Create Sub-Operator for freestream boundaries
+    CeedQFunction qf_apply_freestream = NULL, qf_apply_freestream_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_freestream,
+                                problem->apply_freestream_jacobian, &qf_apply_freestream, &qf_apply_freestream_jacobian));
+    for (CeedInt i = 0; i < bc->num_freestream; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
+                                 basis_q_sur, basis_x_sur, qf_apply_freestream, qf_apply_freestream_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_freestream));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_freestream_jacobian));
+  }
+
+  {  // --- Create Sub-Operator for slip boundaries
+    CeedQFunction qf_apply_slip = NULL, qf_apply_slip_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_slip,
+                                problem->apply_slip_jacobian, &qf_apply_slip, &qf_apply_slip_jacobian));
+    for (CeedInt i = 0; i < bc->num_slip; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->slips[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, basis_q_sur,
+                                 basis_x_sur, qf_apply_slip, qf_apply_slip_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_slip));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_slip_jacobian));
+  }
+
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q_sur));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_sur));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc) {
   PetscFunctionBeginUser;
   const PetscInt num_comp_q = 5;
@@ -262,7 +323,6 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
 
   PetscCall(CreateBasisFromPlex(ceed, dm, domain_label, label_value, height, dm_field, &ceed_data->basis_q));
   PetscCall(CreateBasisFromPlex(ceed, dm_coord, domain_label, label_value, height, dm_field, &ceed_data->basis_x));
-  PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &ceed_data->basis_xc));
   PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q, &num_qpts));
 
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q));
@@ -314,9 +374,11 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
   }
 
   {  // -- Create QFunction for ICs
+    CeedBasis     basis_xc;
     CeedQFunction qf_ics;
     CeedOperator  op_ics;
 
+    PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &basis_xc));
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &qf_ics));
     PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ics, problem->ics.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ics, 0));
@@ -325,12 +387,13 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ics, "q0", num_comp_q, CEED_EVAL_NONE));
 
     PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ics, NULL, NULL, &op_ics));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, basis_xc, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, basis_xc, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ics, "evaluation time", &user->phys->ics_time_label));
     PetscCall(OperatorApplyContextCreate(NULL, dm, user->ceed, op_ics, ceed_data->x_coord, NULL, NULL, user->Q_loc, &ceed_data->op_ics_ctx));
 
+    PetscCallCeed(ceed, CeedBasisDestroy(&basis_xc));
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ics));
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_ics));
   }
@@ -420,67 +483,39 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ijacobian_vol));
   }
 
-  // *****************************************************************************
-  // Set up CEED objects for the exterior domain (surface)
-  // *****************************************************************************
-  height                = 1;
-  CeedInt       dim_sur = dim - height, P_sur = app_ctx->degree + 1, Q_sur = P_sur + app_ctx->q_extra;
-  const CeedInt q_data_size_sur = problem->q_data_size_sur, jac_data_size_sur = user->phys->implicit ? problem->jac_data_size_sur : 0;
-
-  // -----------------------------------------------------------------------------
-  // CEED Bases
-  // -----------------------------------------------------------------------------
-
-  {
-    DM dm_coord;
-
-    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
-    DMLabel  label   = NULL;
-    PetscInt face_id = 0;
-    PetscInt field   = 0;  // Still want the normal, default field
-    PetscCall(CreateBasisFromPlex(ceed, dm, label, face_id, height, field, &ceed_data->basis_q_sur));
-    PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, face_id, height, field, &ceed_data->basis_x_sur));
-  }
-
-  // -----------------------------------------------------------------------------
-  // CEED QFunctions
-  // -----------------------------------------------------------------------------
-  // -- Create QFunction for quadrature data
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_sur.qfunction, problem->setup_sur.qfunction_loc, &ceed_data->qf_setup_sur));
-  if (problem->setup_sur.qfunction_context) {
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_sur, problem->setup_sur.qfunction_context));
-  }
-  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_sur, 0));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "dx", num_comp_x * dim_sur, CEED_EVAL_GRAD));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_sur, "surface qdata", q_data_size_sur, CEED_EVAL_NONE));
-
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_inflow,
-                              problem->apply_inflow_jacobian, &ceed_data->qf_apply_inflow, &ceed_data->qf_apply_inflow_jacobian));
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_outflow,
-                              problem->apply_outflow_jacobian, &ceed_data->qf_apply_outflow, &ceed_data->qf_apply_outflow_jacobian));
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_freestream,
-                              problem->apply_freestream_jacobian, &ceed_data->qf_apply_freestream, &ceed_data->qf_apply_freestream_jacobian));
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_slip,
-                              problem->apply_slip_jacobian, &ceed_data->qf_apply_slip, &ceed_data->qf_apply_slip_jacobian));
-
-  // *****************************************************************************
-  // CEED Operator Apply
-  // *****************************************************************************
   // -- Create and apply CEED Composite Operator for the entire domain
   if (!user->phys->implicit) {  // RHS
     CeedOperator op_rhs;
-    PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_rhs_vol, NULL, height, P_sur, Q_sur, q_data_size_sur, 0, &op_rhs,
-                                      NULL));
+
+    PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_rhs));
+    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_rhs, user->op_rhs_vol));
+    PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, op_rhs, NULL));
+
     PetscCall(OperatorApplyContextCreate(dm, dm, ceed, op_rhs, user->q_ceed, user->g_ceed, user->Q_loc, NULL, &user->op_rhs_ctx));
+
+    // ----- Get Context Labels for Operator
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_rhs, "solution time", &user->phys->solution_time_label));
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_rhs, "timestep size", &user->phys->timestep_size_label));
+
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_rhs));
     PetscCall(CreateKSPMass(user, problem));
     PetscCheck(app_ctx->sgs_model_type == SGS_MODEL_NONE, user->comm, PETSC_ERR_SUP, "SGS modeling not implemented for explicit timestepping");
   } else {  // IFunction
     CeedOperator op_ijacobian = NULL;
 
-    PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_ifunction_vol, op_ijacobian_vol, height, P_sur, Q_sur,
-                                      q_data_size_sur, jac_data_size_sur, &user->op_ifunction, op_ijacobian_vol ? &op_ijacobian : NULL));
+    // Create Composite Operaters
+    PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &user->op_ifunction));
+    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(user->op_ifunction, user->op_ifunction_vol));
+    if (op_ijacobian_vol) {
+      PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_ijacobian));
+      PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_ijacobian, op_ijacobian_vol));
+    }
+    PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, user->op_ifunction, op_ijacobian));
+
+    // ----- Get Context Labels for Operator
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(user->op_ifunction, "solution time", &user->phys->solution_time_label));
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(user->op_ifunction, "timestep size", &user->phys->timestep_size_label));
+
     if (op_ijacobian) {
       PetscCall(MatCeedCreate(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian));
       PetscCall(MatCeedSetLocalVectors(user->mat_ijacobian, user->Q_dot_loc, NULL));
diff --git a/examples/fluids/src/strong_boundary_conditions.c b/examples/fluids/src/strong_boundary_conditions.c
index 9bcc753885..532f486412 100644
--- a/examples/fluids/src/strong_boundary_conditions.c
+++ b/examples/fluids/src/strong_boundary_conditions.c
@@ -26,8 +26,21 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
   PetscFunctionBeginUser;
   PetscCall(DMGetLabel(dm, "Face Sets", &domain_label));
 
-  // Basis
-  PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x_sur, ceed_data->basis_q_sur, &basis_x_to_q_sur));
+  {  // Basis
+    CeedBasis basis_x_sur, basis_q_sur;
+    DM        dm_coord;
+
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    DMLabel  label       = NULL;
+    PetscInt label_value = 0;
+    PetscCall(CreateBasisFromPlex(ceed, dm, label, label_value, height, dm_field, &basis_q_sur));
+    PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, label_value, height, dm_field, &basis_x_sur));
+
+    PetscCallCeed(ceed, CeedBasisCreateProjection(basis_x_sur, basis_q_sur, &basis_x_to_q_sur));
+
+    PetscCallCeed(ceed, CeedBasisDestroy(&basis_q_sur));
+    PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_sur));
+  }
 
   // Setup QFunction
   PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupStrongBC, SetupStrongBC_loc, &qf_setup));
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 54ab617afc..39743b0898 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -183,7 +183,6 @@ PetscErrorCode GetQuadratureCoords(Ceed ceed, DM dm, CeedElemRestriction elem_re
 
 PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data, ProblemData problem, SpanStatsSetupData *stats_data) {
   DM       dm = user->spanstats.dm;
-  PetscInt dim;
   CeedInt  num_comp_x, num_comp_stats = user->spanstats.num_comp_stats;
   Vec      X_loc;
   DMLabel  domain_label = NULL;
@@ -192,7 +191,6 @@ PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data
   PetscFunctionBeginUser;
   PetscCall(PetscNew(stats_data));
 
-  PetscCall(DMGetDimension(dm, &dim));
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &(*stats_data)->elem_restr_parent_stats));
   PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &(*stats_data)->elem_restr_parent_x));
   PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, problem->q_data_size_sur,

From a5b0ec6f41c4198e9c6bd74b9679c3c63ae70b36 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sun, 28 Apr 2024 20:14:07 -0600
Subject: [PATCH 011/571] fluids: Remove op_*_vol from user struct

And some other misc style changes
---
 examples/fluids/navierstokes.c     |   2 -
 examples/fluids/navierstokes.h     |   2 +-
 examples/fluids/src/setuplibceed.c | 127 ++++++++++++++---------------
 3 files changed, 61 insertions(+), 70 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 0e8cf6b4a3..74e80499a2 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -298,8 +298,6 @@ int main(int argc, char **argv) {
 
   // -- Operators
   PetscCall(OperatorApplyContextDestroy(ceed_data->op_ics_ctx));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_rhs_vol));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_ifunction_vol));
   PetscCall(OperatorApplyContextDestroy(user->op_rhs_ctx));
   PetscCall(OperatorApplyContextDestroy(user->op_strong_bc_ctx));
   PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_ifunction));
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 18967835c0..783fcab1e4 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -231,7 +231,7 @@ struct User_private {
   Physics              phys;
   AppCtx               app_ctx;
   CeedVector           q_ceed, q_dot_ceed, g_ceed, x_ceed;
-  CeedOperator         op_rhs_vol, op_ifunction_vol, op_ifunction;
+  CeedOperator         op_ifunction;
   Mat                  mat_ijacobian;
   KSP                  mass_ksp;
   OperatorApplyContext op_rhs_ctx, op_strong_bc_ctx;
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 24c6a8a61c..ec8048d657 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -100,8 +100,6 @@ static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DML
   PetscInt            dm_field = 0;
 
   PetscFunctionBeginUser;
-
-  // ---- CEED Restriction
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &elem_restr_q_sur));
   PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &elem_restr_x_sur));
   PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_sur, &elem_restr_qd_i_sur));
@@ -125,7 +123,7 @@ static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DML
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
   }
 
-  // ----- CEED Operator for Physics
+  // CEED Operator for Physics
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc, NULL, NULL, &op_apply_bc));
   PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
@@ -145,11 +143,10 @@ static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DML
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
   }
 
-  // ----- Apply Sub-Operator for Physics
+  // Apply Sub-Operator for Physics
   PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_apply, op_apply_bc));
   if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_apply_ijacobian, op_apply_bc_jacobian));
 
-  // ----- Cleanup
   PetscCallCeed(ceed, CeedVectorDestroy(&q_data_sur));
   PetscCallCeed(ceed, CeedVectorDestroy(&jac_data_sur));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q_sur));
@@ -190,7 +187,7 @@ static PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt nu
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-// Utility function to create CEED Composite Operator for the entire domain
+// Utility function to add boundary operators to the composite operator
 static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc, ProblemData problem, CeedData ceed_data, CeedOperator op_apply,
                                         CeedOperator op_apply_ijacobian) {
   CeedInt       height = 1, num_comp_q, num_comp_x;
@@ -225,7 +222,7 @@ static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc
     PetscCall(DMGetCoordinateDM(dm, &dm_coord));
     DMLabel  label       = NULL;
     PetscInt label_value = 0;
-    PetscInt field       = 0;  // Still want the normal, default field
+    PetscInt field       = 0;
     PetscCall(CreateBasisFromPlex(ceed, dm, label, label_value, height, field, &basis_q_sur));
     PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, label_value, height, field, &basis_x_sur));
   }
@@ -300,10 +297,14 @@ static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc
 }
 
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc) {
+  const PetscInt      num_comp_q = 5;
+  const CeedInt       dim = problem->dim, num_comp_x = problem->dim, q_data_size_vol = problem->q_data_size_vol;
+  CeedInt             jac_data_size_vol = num_comp_q + 6 + 3;
+  CeedElemRestriction elem_restr_jd_i;
+  CeedVector          jac_data;
+  CeedOperator        op_ifunction_vol = NULL, op_rhs_vol = NULL, op_ijacobian_vol = NULL;
+
   PetscFunctionBeginUser;
-  const PetscInt num_comp_q = 5;
-  const CeedInt  dim = problem->dim, num_comp_x = problem->dim, q_data_size_vol = problem->q_data_size_vol;
-  CeedInt        jac_data_size_vol = num_comp_q + 6 + 3;
 
   if (problem->apply_vol_ifunction.qfunction && problem->uses_newtonian) {
     NewtonianIdealGasContext gas;
@@ -312,30 +313,27 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas));
   }
 
-  CeedElemRestriction elem_restr_jd_i;
-  CeedVector          jac_data;
-  CeedInt             num_qpts;
-  DMLabel             domain_label = NULL;
-  PetscInt            label_value = 0, height = 0, dm_field = 0;
-
-  DM dm_coord;
-  PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+  {  // Create bases and element restrictions
+    DMLabel  domain_label = NULL;
+    PetscInt label_value = 0, height = 0, dm_field = 0;
+    DM       dm_coord;
 
-  PetscCall(CreateBasisFromPlex(ceed, dm, domain_label, label_value, height, dm_field, &ceed_data->basis_q));
-  PetscCall(CreateBasisFromPlex(ceed, dm_coord, domain_label, label_value, height, dm_field, &ceed_data->basis_x));
-  PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q, &num_qpts));
-
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q));
-  PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_vol, &ceed_data->elem_restr_qd_i));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i));
-
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_qd_i, &ceed_data->q_data, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL));
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    PetscCall(CreateBasisFromPlex(ceed, dm, domain_label, label_value, height, dm_field, &ceed_data->basis_q));
+    PetscCall(CreateBasisFromPlex(ceed, dm_coord, domain_label, label_value, height, dm_field, &ceed_data->basis_x));
+
+    PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q));
+    PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x));
+    PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_vol, &ceed_data->elem_restr_qd_i));
+    PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i));
+
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_qd_i, &ceed_data->q_data, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL));
+  }
 
   {  // -- Copy PETSc coordinate vector into CEED vector
     Vec X_loc;
@@ -400,7 +398,6 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
 
   if (problem->apply_vol_rhs.qfunction) {
     CeedQFunction qf_rhs_vol;
-    CeedOperator  op;
 
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &qf_rhs_vol));
     PetscCallCeed(ceed, CeedQFunctionSetContext(qf_rhs_vol, problem->apply_vol_rhs.qfunction_context));
@@ -412,21 +409,19 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
 
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_rhs_vol, NULL, NULL, &op));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    user->op_rhs_vol = op;
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_rhs_vol, NULL, NULL, &op_rhs_vol));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
 
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_rhs_vol));
   }
 
   if (problem->apply_vol_ifunction.qfunction) {
     CeedQFunction qf_ifunction_vol;
-    CeedOperator  op;
 
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ifunction.qfunction, problem->apply_vol_ifunction.qfunction_loc,
                                                     &qf_ifunction_vol));
@@ -441,24 +436,21 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
 
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ifunction_vol, NULL, NULL, &op));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
-
-    user->op_ifunction_vol = op;
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ifunction_vol, NULL, NULL, &op_ifunction_vol));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
+
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ifunction_vol));
   }
 
-  CeedOperator op_ijacobian_vol = NULL;
   if (problem->apply_vol_ijacobian.qfunction) {
     CeedQFunction qf_ijacobian_vol;
-    CeedOperator  op;
 
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ijacobian.qfunction, problem->apply_vol_ijacobian.qfunction_loc,
                                                     &qf_ijacobian_vol));
@@ -471,14 +463,13 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
 
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    op_ijacobian_vol = op;
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op_ijacobian_vol));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
 
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ijacobian_vol));
   }
@@ -488,7 +479,7 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     CeedOperator op_rhs;
 
     PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_rhs));
-    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_rhs, user->op_rhs_vol));
+    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_rhs, op_rhs_vol));
     PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, op_rhs, NULL));
 
     PetscCall(OperatorApplyContextCreate(dm, dm, ceed, op_rhs, user->q_ceed, user->g_ceed, user->Q_loc, NULL, &user->op_rhs_ctx));
@@ -505,7 +496,7 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
 
     // Create Composite Operaters
     PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &user->op_ifunction));
-    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(user->op_ifunction, user->op_ifunction_vol));
+    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(user->op_ifunction, op_ifunction_vol));
     if (op_ijacobian_vol) {
       PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_ijacobian));
       PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_ijacobian, op_ijacobian_vol));
@@ -530,8 +521,10 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
   if (app_ctx->diff_filter_monitor && !user->diff_filter) PetscCall(DifferentialFilterSetup(ceed, user, ceed_data, problem));
   if (app_ctx->sgs_train_enable) PetscCall(SGS_DD_TrainingSetup(ceed, user, ceed_data, problem));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&jac_data));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_jd_i));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian_vol));
-  PetscCallCeed(ceed, CeedVectorDestroy(&jac_data));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_ifunction_vol));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_rhs_vol));
   PetscFunctionReturn(PETSC_SUCCESS);
 }

From 2249ac91e41bcb38055c29b6427350f9214a7158 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sat, 27 Apr 2024 17:46:15 -0600
Subject: [PATCH 012/571] fluids: Introduce -idl_pressure option

Also re-instates the old P0 behavior with regard to the idl_pressure
---
 examples/fluids/README.md                     |   5 ++++
 examples/fluids/index.md                      |  15 +++++------
 examples/fluids/navierstokes.c                |   2 +-
 examples/fluids/problems/blasius.c            |  25 ++++++++++--------
 examples/fluids/problems/newtonian.c          |   7 +++--
 .../fluids-navierstokes-gaussianwave-IDL.bin  | Bin 2328 -> 2340 bytes
 6 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index c3505fe913..f24e09a694 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -634,6 +634,11 @@ For the Density Current, Channel, and Blasius problems, the following common com
   - `0`
   - `m`
 
+* - `-idl_pressure`
+  - Pressure used for IDL reference pressure
+  -  `-reference_pressure`
+  - `Pa`
+
 * - `-sgs_model_type`
   - Type of subgrid stress model to use. Currently only `data_driven` is available
   - `none`
diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index 9c53bef0f3..f035168ec0 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -855,20 +855,17 @@ numerous terms in the STG formulation.
 
 #### Internal Damping Layer (IDL)
 The STG inflow boundary condition creates large amplitude acoustic waves.
-We use an internal damping layer (IDL) to damp them out without disrupting the synthetic structures developing into natural turbulent structures. This implementation was inspired from
-{cite}`shurSTG2014`, but is implemented here as a ramped volumetric forcing
-term, similar to a sponge layer (see 8.4.2.4 in {cite}`colonius2023turbBC` for example). It takes the following form:
+We use an internal damping layer (IDL) to damp them out without disrupting the synthetic structures developing into natural turbulent structures.
+This implementation was inspired by {cite}`shurSTG2014`, but is implemented here as a ramped volumetric forcing term, similar to a sponge layer (see 8.4.2.4 in {cite}`colonius2023turbBC` for example).
+It takes the following form:
 
 $$
 S(\bm{q}) = -\sigma(\bm{x})\left.\frac{\partial \bm{q}}{\partial \bm{Y}}\right\rvert_{\bm{q}} \bm{Y}'
 $$
 
-where $\bm{Y}' = [P - P_\mathrm{ref}, \bm{0}, 0]^T$, and $\sigma(\bm{x})$ is a
-linear ramp starting at `-idl_start` with length `-idl_length` and an amplitude
-of inverse `-idl_decay_rate`. The damping is defined in terms of a pressure-primitive
-anomaly $\bm Y'$ converted to conservative source using $\partial
-\bm{q}/\partial \bm{Y}\rvert_{\bm{q}}$, which is linearized about the current
-flow state. $P_\mathrm{ref}$ is defined via the `-reference_pressure` flag.
+where $\bm{Y}' = [P - P_\mathrm{ref}, \bm{0}, 0]^T$, and $\sigma(\bm{x})$ is a linear ramp starting at `-idl_start` with length `-idl_length` and an amplitude of inverse `-idl_decay_rate`.
+The damping is defined in terms of a pressure-primitive anomaly $\bm Y'$ converted to conservative source using $\partial \bm{q}/\partial \bm{Y}\rvert_{\bm{q}}$, which is linearized about the current flow state.
+$P_\mathrm{ref}$ has a default value equal to `-reference_pressure` flag, with an optional flag `-idl_pressure` to set it to a different value.
 
 ### Meshing
 
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 5741119dde..9065f579af 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -29,7 +29,7 @@
 //TESTARGS(name="Blasius, SGS DataDriven Fused") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin
 //TESTARGS(name="Blasius, Anisotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 5e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_aniso_vandriest.bin -diff_filter_monitor -ts_max_steps 0 -state_var primitive -diff_filter_friction_length 1e-5 -diff_filter_wall_damping_function van_driest -diff_filter_ksp_rtol 1e-8 -diff_filter_grid_based_width -diff_filter_width_scaling 1,0.7,1
 //TESTARGS(name="Blasius, Isotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 2e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_iso.bin -diff_filter_monitor -ts_max_steps 0 -diff_filter_width_scaling 4.2e-5,4.2e-5,4.2e-5 -diff_filter_ksp_atol 1e-14 -diff_filter_ksp_rtol 1e-16
-//TESTARGS(name="Gaussian Wave, with IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -ts_alpha_radius 0.5
+//TESTARGS(name="Gaussian Wave, with IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -ts_alpha_radius 0.5 -idl_pressure 70
 //TESTARGS(name="Spanwise Turbulence Statistics") -ceed {ceed_resource} -test_type turb_spanstats -options_file examples/fluids/tests-output/stats_test.yaml -compare_final_state_atol 1E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-turb-spanstats-stats.bin
 //TESTARGS(name="Blasius") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius.bin
 //TESTARGS(name="Blasius, STG Inflow") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/tests-output/blasius_stgtest.yaml -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_STG.bin
diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index 4ea6389b99..cb061b3bc8 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -262,14 +262,17 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   PetscInt   mesh_Ndelta                          = 45;           // [-]
   PetscReal  mesh_top_angle                       = 5;            // degrees
   char       mesh_ynodes_path[PETSC_MAX_PATH_LEN] = "";
-  PetscBool  flg;
+  PetscBool  P0_set;
 
   PetscOptionsBegin(comm, NULL, "Options for BLASIUS problem", NULL);
   PetscCall(PetscOptionsBool("-weakT", "Change from rho weak to T weak at inflow", NULL, weakT, &weakT, NULL));
   PetscCall(PetscOptionsScalar("-velocity_infinity", "Velocity at boundary layer edge", NULL, U_inf, &U_inf, NULL));
   PetscCall(PetscOptionsScalar("-temperature_infinity", "Temperature at boundary layer edge", NULL, T_inf, &T_inf, NULL));
-  PetscCall(PetscOptionsDeprecated("-P0", "-pressure_infinity", "libCEED 0.12.0", "Use -pressure_infinity to set pressure at boundary layer edge"));
-  PetscCall(PetscOptionsScalar("-pressure_infinity", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, &flg));
+  PetscCall(PetscOptionsHasName(NULL, NULL, "-P0", &P0_set));  // For maintaining behavior of -P0 flag (which is deprecated)
+  PetscCall(
+      PetscOptionsDeprecated("-P0", "-pressure_infinity", "libCEED 0.12.0",
+                             "Use -pressure_infinity to set pressure at boundary layer edge and -idl_pressure to set the IDL reference pressure"));
+  PetscCall(PetscOptionsScalar("-pressure_infinity", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, NULL));
   PetscCall(PetscOptionsScalar("-temperature_wall", "Temperature at wall", NULL, T_wall, &T_wall, NULL));
   PetscCall(PetscOptionsScalar("-delta0", "Boundary layer height at inflow", NULL, delta0, &delta0, NULL));
   PetscCall(PetscOptionsInt("-n_chebyshev", "Number of Chebyshev terms", NULL, N, &N, NULL));
@@ -318,14 +321,14 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   };
   State S_infty = StateFromPrimitive(newtonian_ig_ctx, Y_inf);
 
-  blasius_ctx->weakT             = weakT;
-  blasius_ctx->T_wall            = T_wall;
-  blasius_ctx->delta0            = delta0;
-  blasius_ctx->S_infty           = S_infty;
-  blasius_ctx->n_cheb            = N;
-  newtonian_ig_ctx->idl_pressure = P_inf;
-  blasius_ctx->implicit          = user->phys->implicit;
-  blasius_ctx->newtonian_ctx     = *newtonian_ig_ctx;
+  blasius_ctx->weakT    = weakT;
+  blasius_ctx->T_wall   = T_wall;
+  blasius_ctx->delta0   = delta0;
+  blasius_ctx->S_infty  = S_infty;
+  blasius_ctx->n_cheb   = N;
+  blasius_ctx->implicit = user->phys->implicit;
+  if (P0_set) newtonian_ig_ctx->idl_pressure = P_inf;  // For maintaining behavior of -P0 flag (which is deprecated)
+  blasius_ctx->newtonian_ctx = *newtonian_ig_ctx;
 
   {
     PetscReal domain_min[3], domain_max[3];
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index c28bf50edf..28faf373d2 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -169,7 +169,7 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i];
 
   StatePrimitive reference      = {.pressure = 1.01e5, .velocity = {0}, .temperature = 288.15};
-  CeedScalar     idl_decay_time = -1, idl_start = 0, idl_length = 0;
+  CeedScalar     idl_decay_time = -1, idl_start = 0, idl_length = 0, idl_pressure = reference.pressure;
   PetscBool      idl_enable = PETSC_FALSE;
 
   // ------------------------------------------------------
@@ -267,6 +267,9 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   if (idl_decay_time < 0) idl_enable = PETSC_FALSE;
   PetscCall(PetscOptionsScalar("-idl_start", "Start of IDL in the x direction", NULL, idl_start, &idl_start, NULL));
   PetscCall(PetscOptionsScalar("-idl_length", "Length of IDL in the positive x direction", NULL, idl_length, &idl_length, NULL));
+  idl_pressure = reference.pressure;
+  PetscCall(PetscOptionsScalar("-idl_pressure", "Pressure IDL uses as reference (default is `-reference_pressure`)", NULL, idl_pressure,
+                               &idl_pressure, NULL));
   PetscOptionsEnd();
 
   if (stab == STAB_SUPG && !implicit) problem->create_mass_operator = CreateKSPMassOperator_NewtonianStabilized;
@@ -320,7 +323,6 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   newtonian_ig_ctx->Ctau_C        = Ctau_C;
   newtonian_ig_ctx->Ctau_M        = Ctau_M;
   newtonian_ig_ctx->Ctau_E        = Ctau_E;
-  newtonian_ig_ctx->idl_pressure  = reference.pressure;
   newtonian_ig_ctx->stabilization = stab;
   newtonian_ig_ctx->is_implicit   = implicit;
   newtonian_ig_ctx->state_var     = state_var;
@@ -328,6 +330,7 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   newtonian_ig_ctx->idl_amplitude = 1 / (idl_decay_time * second);
   newtonian_ig_ctx->idl_start     = idl_start * meter;
   newtonian_ig_ctx->idl_length    = idl_length * meter;
+  newtonian_ig_ctx->idl_pressure  = idl_pressure;
   PetscCall(PetscArraycpy(newtonian_ig_ctx->g, g, 3));
 
   // -- Setup Context
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin
index facbebe2d632707bacd10577579b9c7b06dad07f..fcb16553ca4f9c0c92fa4e5f69b77498db6c081d 100644
GIT binary patch
literal 2340
zcmXBVc|29y9suxdDl;ifD&%u3FYmft(txv8QRk&`A6-M{Tjrr|JW44tL`5kdL+8}#
zxX3kAhC&g!MLNfjA(i>|hz#K>oc-3XyZ_mrJ*?mU{nq}i?^68SeS-SukU<f_z3jYY
z=e}R6ojdf;Z3!(*-wvyqzVrxFp@N3oZTgrhz}aT%8c3wBb$@ks1EwoyTGTSqk$85=
z<9r7XO!)?z`kvl}MA9}rl84!tv(B{UUGzD`^m|m)FAXNol|KwWpGBfY7X$b0SdHl^
zoz|8ZKBlzqc=){$V@ge0Emns-mx#Anb}b0g<0I8>c}<9!=-8`uYXmXOxqyce2}nd~
zT#9QL#q{TyUfb+;OiAhdnlEt$F$)vVmOe5^LMDE(^r;7!j^)i*>Mmd^%`}X2k&BqJ
z|N1(+`6H1^d2YbWH%zB0*W8b@L(DoY<@2Vxi22yi*w7tI#$RrAIx~;yvJm6{?U+YQ
zM{AV#(@7+D7jd;$Bp_kmRpkQlFPQ#o*y_mgM#MBdq1r<H5!2YPPnqwGgkr@a-=KO-
zk1=n)ZefwUc|#_yYY`)Mc)Q}uArjyFyk~_asn73Yk<TtnQJwtF=J}YCbXqg~6V$iu
zaIIhv(>J~xkd!UJ)UzVTFDm4{WnYh^F78Dl)s6SMZt^glA)UkJG$Zj*956;TV@mGs
zTK|-0GG3|9%OMMMY}dWd(%Olb>hzPs7EdfmxsSgHK8{36FI-}6-(h-Q=B@VeR80Lc
zI<1;>6;rH(ad)_EB$Bj0aK>gOrYCcgy3*AU(;KO^nT<$(1FEYGV@Uj_zn+Kf!StxX
zR&-1eQ;SkwS#9h^%;e<S!X6f>-!IT)ryj{)@%>5t08Hf$m5o$uA|~X_M?Gc>nSTY3
zTYCF3E#Pqqv(peGwJo$QJqR(au7>@Fhe`ZVmeP6)%&~f7#GVR6OmnHlCzGGi__TL>
zV6_m5HgUDbE=pp0`sa$`&5sc?FSkQjKaRxa95#Pmp7fVnvnFmN`8}+0v)3p66XBY=
z^#)=-&ls0}`~wN+Q}<??Jtgy>Sg%x2PwEm6s3T8IEw^X;ej6p<=eiGVa>R7aqXFrX
z<5&uNApe(In37jw4+}OSk$$n|+14gZzln=WK4*nS9~JsdJlTPmsrXy0{$wOtX}Ege
z_BU*G_djYf_g^6KrfSVhX*Qah6q){7;f`JhMTpyuY`~n#DR(D}87#YZ&@E{lso(WN
z_gGvfnZF_p3wbxpIcgBQeb5~(q+rdMGHb+)mH+OLFoi@vBn4k-Ny0QguW`ky>tua;
z^_(e?#FWD7^J$_~By5yRl+m^&{e93}(36cN<R>^ri;@tNWwJAPkrfhY6<;Xnio<l(
z_F~mmZ8UFK*PmsWi<r)?^9j)%<o`Hi){Q*FoNB>(^Ys*BqW}HVt~4H&FZKOqW+_0z
zBxx_N3tpI`IGHrGlZz$712dPXx?%}ezx^gXM@jz}oq7?HfjQJ~UV1t!5fiE=X8NNr
zrOB%LQ-G03=8|KT+*wTb@&A6inZlHYe}|3#KE&LkPF%OhM#5Ps^<s%E@Taz8qy_wW
z8o28n_`_H5PXT|@6ULOlpXRPbcHmEDP*ybfV{L332L5#K!G+*YCFQ9C{+#bzc?SHM
zk=NS;{+u$kC+lZQr@J$4EBF(d7Jm`^Sy5^r1OC`Kdo2ckN?!f=h4fGRbp@6H{5gG^
zUr+i&Ba~xd2mTCrN=1M_uJ-OR;Ll5L?g8*ewj^&H{P{e8qZj<)XB1rqe*zV}UxPn6
z6Xho0Pu2EVbMQy5wJ-|&xhd2<0sb^ybFKt`HawX8?oVI%-)F&}-W2IO;E!&ar8oF<
zWS6HA__O$WSsnP}T<XFDe{x$dtN?#{^oCPN-ggQIEtSBZ$*It4@Fxr_{tNsW@Oix(
z{7HD8(hvTus^3eK`nDY!|E2@}_|01Lz#r95rv>1Tb<~;#@Fy=~4-5QB9qg+Hf1IV4
znu9+cZ5nfAyix-vB%k=BIc3XoBK~9;)Z~8vf0nFFy9xfpxh$6fe`-t2GQppWI$d&R
z9gHpY$_IbUN;Q(fpZF`4iNv3X@!B5ppCtPy`6!XZZyFeTIq}_}X*(J4r+=<E4*dBs
z)YJI8Kc)*U;E#*{o+RSWu*$)r3gS-?_n{;4Peoyg>d~9vPjHY=7V*cZWVhcF;*Yzm
z7C)85AGP?n#hduknsQaI2>j8rSZYK3ae5UKc8K_Mdv;ZOAow$OHQ$5yBY*m&Q8@93
ze&Eb40)K{wj0(V?+Q={Eq<<ni^rHvCA1{rFTJXo>jX&A{#uHy9w0{79cJMkMf<N2l
zw8{FWxSrMnyNN%K^Rkm=i9epYUx$ga@=7kjhHr^KCH#=bRp5`yhZue0Pr6c6ycGD8
zvMDl}_+#+Cer7lEhg~j=av}aatF!!*NBj|X_nVy}{zOv8>I%T0KXP0Qz#q#wmE**p
zivst+7vRrgDzp>)ahmJgK>RT_JF7<O94oJAH~a1nd(ck^{tSFdSVGpPk7vcPaPTL9
zRYvyDM!C9;V`ikk57h9Mb%Q@u+@u2Hk12}^uK<7a!hgR2{!H`*TqXY0Hu?N!5B?;}
zJ6eN3&+Ycsf<GFEj=6z9cR7*W#2-INd)?*WkCWy8%ivFHqU|jBgMaY2PyF$1UNfl-
z{sh|gtARg0)y9^@AN2;CiF@Er3io;q_~UUcK?3|?c+<Av&sjNRckm}*x~~oVQS-Qw
F^MBy-Sfl^|

literal 2328
zcmW;OcU)7~7Xa|bh=3?4!2vEHEb%8GU{ML&i=rZR@DfYKfsK2Bihu(U7bruf0g+_{
zaf5uKtwo8A{3=sPYyqXrm>Q`f5XzAGe&_!3@A;f_?z``t@5_|#b_#wh6dkTuTWlOb
z8AWWQC=D}A_YMT#l5fG3o_#>G;5$rdC7f(Pi;$ekSUt~m5aNmNoRlqyk*#_=axMi^
z8Xlc?X>JhrSi(AR6Z1^g-^x5zhL~@CvlKozF%>>qm!y;-xo-9uJaG@x1FntxrYbRI
zvaC95#|lgtdOX$QY9YCXadO7NAgCYgDOvdz@^8J_=LB&ooqghY1=L?yT_2i-=^p;2
z(Ck>~yX|@Ck41<X*PK~&bP19%xxb7A_hGs_r`KWY7-IgG=A|XALCn9;a?7h!NH(!X
zaYu0%^4ZkP;V4X5z9<i0BSg%oK^61$3FNy5<fI@>i#zPbTg8}S%@7vMtws}jmPQ+S
z9!0WGyn=I43QTu$E$n}OikK%x_d@g15mT3<K9srw$;S0vwc8?~{&&UEsg{r*dB<$3
z0b(X>qhpMMp#Glv`UWdZtBub!UCoBR?=;vkCo!dYcxaNrOn5%pwrztCru%|q8#f9t
zHT%N9f)nO&Ke<(sFg4syzU7%O81hH(GqDq5LX<l=%?ze=O5OFGoFV`IhYOJk%;Rs`
zZ?Pc@G2JJ)saYjhlTwvQwDXbNQ1!r6|2?M1{T`k7--#*j8G-RNeVFp_aGX~84eEFD
z2h2A?{oc$G=}yFS{fA#v!G-gw5=RWSLw(t-TWKzs{`z|L!jnEQ7fpVDsoW6Lr@r>Y
zd;;z_*F7D#K)%)ElOfI!?~s<gxQCedX{)cB?1H#|VS0Zlrn_o2b*!r}HT`(d>(+9_
zwC8Qz^U@dU*UYx~;{oQ;mOd`s^AW=&d)1f7QEy+j#$v&JB)4nbeIU~k(}U;V`-r>{
z)8WN6aDIU%yz-96Wj;c3R#(z>$r`9X<UZy#ftax+SJ~E|;T~l6f$N%(Y%K4Fw*d#v
z=NPl-*d&-MZqrXf8%)hSuxo9R6#9#A1c*H_&$^Ru<1~OZ6CB0A#U~@Cb&B75T!-Wq
zZI8-o=3yQu>4(bm9MroYJJIaVWpMuN1z!vWNNyTtC)8Yz{jR&m$;ywR32f@$;G#yO
z1J_)mWxl8`e9TP8G#K+<6xIrtHDX=2R)@gbvk|jrQ0`+|1oQJt#JYzs;QhT66;8X3
z#&33rM#Z*>>1nj%KgmIItNaY@oc)*<%?(|&PZv{Bi>n+HzhO$BU1%HUjAU<YiX0mr
zV0t8~QOCU!Q>Q;~v>3|36n&$=UBrR=Ii7I3EE>~8+QVrTzG!^;wlp^f5vE*P_f$-u
z2m84zY@)Xk^CW^dzb=eKOu?_a9?#amlZ!vv2>wh%vK+P4wKxIurhb^tmPBHW(^e(A
zfxEEAey<^Qav;o4eAb(Pwqu&<Ugj3E88NZTqc~@qF*RS(JMCy0^v_tA8o<W1a!UVQ
zm;=h>$K$WI!XAi{m{WrhNcL^UJgt^s;!oHPm74hD`|Tq5p_Oof4u|(oWq3FJV@mw#
zP_2RYDqGcaRPQqJM;$Y_9O53%U1K)HpQ!rSJmQa0b_(oa_^7I9sDt<uJlVUG_|weq
zeoFjlQoKJz{3#W3EQmiT>Y*;;kIQ4dgTx<p@yFkZKgRAwo=|^bg>o^>cMm^aG1f@@
zd3jl^B>tGx{*XibDGgX1PyA8l+jkLvYP_q}#Gi|~g89TB`KE*IkZ;s)>w1a!b8oU7
zoE$ZftQ`m={%o%EtRnt=&s_;Vv1X)cwMmITQ`%Nr6Mz2rS+<w>V_xI*lK7K1Qlts~
zq-w{_`AGa(cIvDl@#neZ^I6D0W3gQ7`+t8b!|oA(nkO+~F#pDpN!xOuZ{uH|%-|fF
zhlj_9o8b9qo9Z4Z_;W;KmCh96k8)*sCh_M@$M0<7kH>w*JIEj5rs{()A<Ct-zR*Xf
zbZ?;DN65ebX{L7!%&+gYlRM?apA)YQe<S_`Xx9pfKksv#!QXMetS#bU;?Ehu#z5lF
zitk_DB>t=qdrm?9-c;T*DfqK!o4Ir;@#pf?l31uOGtOi||F5rCR@8%!T21M50>R-v
zb!z#o5#kTGbtlQUvUF7a1aXDUQ0+tT$MDYnI1h+#=?@K35r1q$<Wl0#taQ0I?C%Xz
zmdF<B*Kpbo&j)`v*ZhZf6Mq`E9J&PlbeeF5;Imz;kG0Ja@aKZYbsq=tXI0T88&~ir
z_vxz*#l)XGDM>qtKP#-73&9_kE1DxGh(GHdb`}tS^3@7YIG<xmJdX`?<yP@lu$%a^
z`JLuP=r7JM_^&7Nr#Cb)lK7J`c!dZ4=(qNNs04r7Z*Io$edQ#%_S_eOKbih$Gkjm#
zv(HY?TSEMqc2M^U@rSG5t^@we$(+@>8~hO#ER=;2e@yw03c;V-CY%T%_|v8<YA^?X
zhGODqf8x(plLYXM|1#XwUJm}WoYsoHOZ=Hmjlq6d<p)WWXNf<FW&uaQABQ6%4(y*k
z`+><O1L99ZXYwK9k7LNyCh+Hr_4Y6I;E(8G#5)D?X92534F0e@8(8V!kAL+Xy;I;%
zOZ)B4c;ZjG+m&wOPl<zZANW&h7NWNt{80_+3sZ<c*}n7);?Jy)75c=Vc{iH8h(8ez
zg?hxFg1vQZ#2=5w);ZvhyM6fFt;C-eD?RxC!OGy%j|9Y@vDc^mhW+=eFG)QD{v_*I
N<w=M?rt4~K{|68KOu+yE


From cfb075a441d6e246d8dcf2047927a3e0456b3b9f Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sat, 27 Apr 2024 09:13:52 -0600
Subject: [PATCH 013/571] fluids: Add QDataGet functions

---
 examples/fluids/navierstokes.c          |   5 -
 examples/fluids/navierstokes.h          |  11 +-
 examples/fluids/problems/advection.c    |  14 --
 examples/fluids/problems/eulervortex.c  |   7 -
 examples/fluids/problems/newtonian.c    |  17 +-
 examples/fluids/problems/shocktube.c    |   7 -
 examples/fluids/qfunctions/setupgeo2d.h |  55 ++++++-
 examples/fluids/src/qdata.c             | 199 ++++++++++++++++++++++++
 examples/fluids/src/setuplibceed.c      |  88 +++--------
 examples/fluids/src/turb_spanstats.c    |  60 ++++---
 10 files changed, 318 insertions(+), 145 deletions(-)
 create mode 100644 examples/fluids/src/qdata.c

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 44ad5b4ef9..919ff16669 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -285,17 +285,12 @@ int main(int argc, char **argv) {
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_freestream_jacobian.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_slip.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_slip_jacobian.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->setup_sur.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->setup_vol.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->ics.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_rhs.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_ifunction.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_ijacobian.qfunction_context));
   }
 
-  // -- QFunctions
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_sur));
-
   // -- Operators
   PetscCall(OperatorApplyContextDestroy(ceed_data->op_ics_ctx));
   PetscCall(OperatorApplyContextDestroy(user->op_rhs_ctx));
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 783fcab1e4..ce8da6ea28 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -155,7 +155,6 @@ struct CeedData_private {
   CeedBasis            basis_x, basis_q;
   CeedElemRestriction  elem_restr_x, elem_restr_q, elem_restr_qd_i;
   OperatorApplyContext op_ics_ctx;
-  CeedQFunction        qf_setup_sur;
 };
 
 typedef struct {
@@ -291,8 +290,8 @@ typedef struct ProblemData_private *ProblemData;
 struct ProblemData_private {
   CeedInt              dim, q_data_size_vol, q_data_size_sur, jac_data_size_sur;
   CeedScalar           dm_scale;
-  ProblemQFunctionSpec setup_vol, setup_sur, ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow,
-      apply_freestream, apply_slip, apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian;
+  ProblemQFunctionSpec ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow, apply_freestream, apply_slip,
+      apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian;
   bool      non_zero_time;
   PetscBool bc_from_ics, use_strong_bc_ceed, uses_newtonian;
   PetscErrorCode (*print_info)(User, ProblemData, AppCtx);
@@ -349,6 +348,12 @@ PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, CeedI
 
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc);
 
+PetscErrorCode QDataGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                        CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size);
+PetscErrorCode QDataGetNumComponents(DM dm, CeedInt *q_data_size);
+PetscErrorCode QDataBoundaryGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                                CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size);
+PetscErrorCode QDataBoundaryGetNumComponents(DM dm, CeedInt *q_data_size);
 // -----------------------------------------------------------------------------
 // Time-stepping functions
 // -----------------------------------------------------------------------------
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 1d29b2cddb..4235692863 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -14,8 +14,6 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
-#include "../qfunctions/setupgeo2d.h"
 
 // @brief Create CeedOperator for stabilized mass KSP for explicit timestepping
 //
@@ -106,12 +104,6 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
   switch (dim) {
     case 2:
       problem->dim                               = 2;
-      problem->q_data_size_vol                   = 5;
-      problem->q_data_size_sur                   = 3;
-      problem->setup_vol.qfunction               = Setup2d;
-      problem->setup_vol.qfunction_loc           = Setup2d_loc;
-      problem->setup_sur.qfunction               = SetupBoundary2d;
-      problem->setup_sur.qfunction_loc           = SetupBoundary2d_loc;
       problem->ics.qfunction                     = ICsAdvection2d;
       problem->ics.qfunction_loc                 = ICsAdvection2d_loc;
       problem->apply_vol_rhs.qfunction           = RHS_Advection2d;
@@ -125,12 +117,6 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
       break;
     case 3:
       problem->dim                               = 3;
-      problem->q_data_size_vol                   = 10;
-      problem->q_data_size_sur                   = 10;
-      problem->setup_vol.qfunction               = Setup;
-      problem->setup_vol.qfunction_loc           = Setup_loc;
-      problem->setup_sur.qfunction               = SetupBoundary;
-      problem->setup_sur.qfunction_loc           = SetupBoundary_loc;
       problem->ics.qfunction                     = ICsAdvection;
       problem->ics.qfunction_loc                 = ICsAdvection_loc;
       problem->apply_vol_rhs.qfunction           = RHS_Advection;
diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c
index 0115ab5c83..a823a71433 100644
--- a/examples/fluids/problems/eulervortex.c
+++ b/examples/fluids/problems/eulervortex.c
@@ -14,7 +14,6 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
 
 PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   EulerTestType        euler_test;
@@ -33,12 +32,6 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b
   //               SET UP DENSITY_CURRENT
   // ------------------------------------------------------
   problem->dim                               = 3;
-  problem->q_data_size_vol                   = 10;
-  problem->q_data_size_sur                   = 10;
-  problem->setup_vol.qfunction               = Setup;
-  problem->setup_vol.qfunction_loc           = Setup_loc;
-  problem->setup_sur.qfunction               = SetupBoundary;
-  problem->setup_sur.qfunction_loc           = SetupBoundary_loc;
   problem->ics.qfunction                     = ICsEuler;
   problem->ics.qfunction_loc                 = ICsEuler_loc;
   problem->apply_vol_rhs.qfunction           = Euler;
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 28faf373d2..80f84c7ec9 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -14,7 +14,6 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
 
 // For use with PetscOptionsEnum
 static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "StateVariable", "STATEVAR_", NULL};
@@ -136,17 +135,11 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   // ------------------------------------------------------
   //           Setup Generic Newtonian IG Problem
   // ------------------------------------------------------
-  problem->dim                     = 3;
-  problem->q_data_size_vol         = 10;
-  problem->q_data_size_sur         = 10;
-  problem->jac_data_size_sur       = 11;
-  problem->setup_vol.qfunction     = Setup;
-  problem->setup_vol.qfunction_loc = Setup_loc;
-  problem->setup_sur.qfunction     = SetupBoundary;
-  problem->setup_sur.qfunction_loc = SetupBoundary_loc;
-  problem->non_zero_time           = PETSC_FALSE;
-  problem->print_info              = PRINT_NEWTONIAN;
-  problem->uses_newtonian          = PETSC_TRUE;
+  problem->dim               = 3;
+  problem->jac_data_size_sur = 11;
+  problem->non_zero_time     = PETSC_FALSE;
+  problem->print_info        = PRINT_NEWTONIAN;
+  problem->uses_newtonian    = PETSC_TRUE;
 
   // ------------------------------------------------------
   //             Create the libCEED context
diff --git a/examples/fluids/problems/shocktube.c b/examples/fluids/problems/shocktube.c
index 36d2b991e9..38758516f5 100644
--- a/examples/fluids/problems/shocktube.c
+++ b/examples/fluids/problems/shocktube.c
@@ -14,7 +14,6 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
 
 PetscErrorCode NS_SHOCKTUBE(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   SetupContextShock    setup_context;
@@ -35,12 +34,6 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
   //               SET UP SHOCKTUBE
   // ------------------------------------------------------
   problem->dim                               = 3;
-  problem->q_data_size_vol                   = 10;
-  problem->q_data_size_sur                   = 4;
-  problem->setup_vol.qfunction               = Setup;
-  problem->setup_vol.qfunction_loc           = Setup_loc;
-  problem->setup_sur.qfunction               = SetupBoundary;
-  problem->setup_sur.qfunction_loc           = SetupBoundary_loc;
   problem->ics.qfunction                     = ICsShockTube;
   problem->ics.qfunction_loc                 = ICsShockTube_loc;
   problem->apply_vol_rhs.qfunction           = EulerShockTube;
diff --git a/examples/fluids/qfunctions/setupgeo2d.h b/examples/fluids/qfunctions/setupgeo2d.h
index c01753b2c1..4bbb39795c 100644
--- a/examples/fluids/qfunctions/setupgeo2d.h
+++ b/examples/fluids/qfunctions/setupgeo2d.h
@@ -8,7 +8,6 @@
 /// @file
 /// Geometric factors (2D) for Navier-Stokes example using PETSc
 #include <ceed.h>
-#include <math.h>
 #include "setupgeo_helpers.h"
 #include "utils.h"
 
@@ -98,3 +97,57 @@ CEED_QFUNCTION(SetupBoundary2d)(void *ctx, CeedInt Q, const CeedScalar *const *i
   }
   return 0;
 }
+
+// *****************************************************************************
+// This QFunction sets up the geometric factor required for integration when reference coordinates are in 2D and the physical coordinates are in 3D
+//
+// Reference (parent) 2D coordinates: X
+// Physical (current) 3D coordinates: x
+// Change of coordinate matrix:
+//   dxdX_{i,j} = dx_i/dX_j (indicial notation) [3 * 2]
+// Inverse change of coordinate matrix:
+//   dXdx_{i,j} = dX_i/dx_j (indicial notation) [2 * 3]
+//
+// (J1,J2,J3) is given by the cross product of the columns of dxdX_{i,j}
+//
+// detJb is the magnitude of (J1,J2,J3)
+//
+// dXdx is calculated via Moore–Penrose inverse:
+//
+//   dX_i/dx_j = (dxdX^T dxdX)^(-1) dxdX
+//             = (dx_l/dX_i * dx_l/dX_k)^(-1) dx_j/dX_k
+//
+// All quadrature data is stored in 10 field vector of quadrature data.
+//
+// We require the determinant of the Jacobian to properly compute integrals of
+//   the form: int( u v )
+//
+// Stored: w detJb
+//   in q_data_sur[0]
+//
+// Normal vector = (J1,J2,J3) / detJb
+//
+// Stored: (J1,J2,J3) / detJb
+//
+// Stored: dXdx_{i,j}
+//   in q_data_sur[1:6] as
+//    [dXdx_11 dXdx_12 dXdx_13]
+//    [dXdx_21 dXdx_22 dXdx_23]
+// *****************************************************************************
+CEED_QFUNCTION(Setup2D_3Dcoords)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+  const CeedScalar(*w)                = in[1];
+  CeedScalar(*q_data_sur)             = out[0];
+
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    CeedScalar detJb, normal[3], dXdx[2][3];
+
+    NormalVectorFromdxdX_3D(Q, i, J, normal, &detJb);
+    InvertBoundaryMappingJacobian_3D(Q, i, J, dXdx);
+    const CeedScalar wdetJ = w[i] * detJb;
+
+    StoredValuesPack(Q, i, 0, 1, &wdetJ, q_data_sur);
+    StoredValuesPack(Q, i, 1, 6, (const CeedScalar *)dXdx, q_data_sur);
+  }
+  return 0;
+}
diff --git a/examples/fluids/src/qdata.c b/examples/fluids/src/qdata.c
new file mode 100644
index 0000000000..6883c2a8c3
--- /dev/null
+++ b/examples/fluids/src/qdata.c
@@ -0,0 +1,199 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "../navierstokes.h"
+
+#include <petscsection.h>
+#include "../qfunctions/setupgeo.h"
+#include "../qfunctions/setupgeo2d.h"
+
+/**
+ * @brief Get number of components of quadrature data for domain
+ *
+ * @param[in]  dm          DM where quadrature data would be used
+ * @param[out] q_data_size Number of components of quadrature data
+ */
+PetscErrorCode QDataGetNumComponents(DM dm, CeedInt *q_data_size) {
+  PetscInt num_comp_x, dim;
+
+  PetscFunctionBeginUser;
+  PetscCall(DMGetDimension(dm, &dim));
+  {  // Get number of coordinate components
+    DM           dm_coord;
+    PetscSection section_coord;
+    PetscInt     field = 0;  // Default field has the coordinates
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    PetscCall(DMGetLocalSection(dm_coord, &section_coord));
+    PetscCall(PetscSectionGetFieldComponents(section_coord, field, &num_comp_x));
+  }
+  switch (dim) {
+    case 2:
+      switch (num_comp_x) {
+        case 2:
+          *q_data_size = 5;
+          break;
+        case 3:
+          *q_data_size = 7;
+          break;
+        default:
+          SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP,
+                  "QData not valid for DM of dimension %" PetscInt_FMT " and coordinates with dimension %" PetscInt_FMT, dim, num_comp_x);
+          break;
+      }
+      break;
+    case 3:
+      *q_data_size = 10;
+      break;
+    default:
+      SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP,
+              "QData not valid for DM of dimension %" PetscInt_FMT " and coordinates with dimension %" PetscInt_FMT, dim, num_comp_x);
+      break;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+ * @brief Create quadrature data for domain
+ *
+ * @param[in]  ceed          Ceed object quadrature data will be used with
+ * @param[in]  dm            DM where quadrature data would be used
+ * @param[in]  domain_label  DMLabel that quadrature data would be used one
+ * @param[in]  label_value   Value of label
+ * @param[in]  elem_restr_x  CeedElemRestriction of the coordinates (must match `domain_label` and `label_value` selections)
+ * @param[in]  basis_x       CeedBasis of the coordinates
+ * @param[in]  x_coord       CeedVector of the coordinates
+ * @param[out] elem_restr_qd CeedElemRestriction of the quadrature data
+ * @param[out] q_data        CeedVector of the quadrature data
+ * @param[out] q_data_size   number of components of quadrature data
+ */
+PetscErrorCode QDataGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                        CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size) {
+  CeedQFunction qf_setup;
+  CeedOperator  op_setup;
+  CeedInt       num_comp_x;
+  PetscInt      dim, height = 0;
+
+  PetscFunctionBeginUser;
+  PetscCall(QDataGetNumComponents(dm, q_data_size));
+  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x));
+  PetscCall(DMGetDimension(dm, &dim));
+  switch (dim) {
+    case 2:
+      switch (num_comp_x) {
+        case 2:
+          PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup2d, Setup2d_loc, &qf_setup));
+          break;
+        case 3:
+          PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup2D_3Dcoords, Setup2D_3Dcoords_loc, &qf_setup));
+          break;
+      }
+      break;
+    case 3:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup, Setup_loc, &qf_setup));
+      break;
+  }
+
+  // -- Create QFunction for quadrature data
+  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup, 0));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup, "dx", num_comp_x * (dim - height), CEED_EVAL_GRAD));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT));
+  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup, "surface qdata", *q_data_size, CEED_EVAL_NONE));
+
+  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, *q_data_size, elem_restr_qd));
+  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(*elem_restr_qd, q_data, NULL));
+
+  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup, NULL, NULL, &op_setup));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "surface qdata", *elem_restr_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+  PetscCallCeed(ceed, CeedOperatorApply(op_setup, x_coord, *q_data, CEED_REQUEST_IMMEDIATE));
+
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+ * @brief Get number of components of quadrature data for boundary of domain
+ *
+ * @param[in]  dm          DM where quadrature data would be used
+ * @param[out] q_data_size Number of components of quadrature data
+ */
+PetscErrorCode QDataBoundaryGetNumComponents(DM dm, CeedInt *q_data_size) {
+  PetscInt dim;
+
+  PetscFunctionBeginUser;
+  PetscCall(DMGetDimension(dm, &dim));
+  switch (dim) {
+    case 2:
+      *q_data_size = 3;
+      break;
+    case 3:
+      *q_data_size = 10;
+      break;
+    default:
+      SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, "QDataBoundary not valid for DM of dimension %" PetscInt_FMT, dim);
+      break;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+ * @brief Create quadrature data for boundary of domain
+ *
+ * @param[in]  ceed          Ceed object quadrature data will be used with
+ * @param[in]  dm            DM where quadrature data would be used
+ * @param[in]  domain_label  DMLabel that quadrature data would be used one
+ * @param[in]  label_value   Value of label
+ * @param[in]  elem_restr_x  CeedElemRestriction of the coordinates (must match `domain_label` and `label_value` selections)
+ * @param[in]  basis_x       CeedBasis of the coordinates
+ * @param[in]  x_coord       CeedVector of the coordinates
+ * @param[out] elem_restr_qd CeedElemRestriction of the quadrature data
+ * @param[out] q_data        CeedVector of the quadrature data
+ * @param[out] q_data_size   number of components of quadrature data
+ */
+PetscErrorCode QDataBoundaryGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                                CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size) {
+  CeedQFunction qf_setup_sur;
+  CeedOperator  op_setup_sur;
+  CeedInt       num_comp_x;
+  PetscInt      dim, height = 1;
+
+  PetscFunctionBeginUser;
+  PetscCall(QDataBoundaryGetNumComponents(dm, q_data_size));
+  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x));
+  PetscCall(DMGetDimension(dm, &dim));
+  switch (dim) {
+    case 2:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupBoundary2d, SetupBoundary2d_loc, &qf_setup_sur));
+      break;
+    case 3:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupBoundary, SetupBoundary_loc, &qf_setup_sur));
+      break;
+  }
+
+  // -- Create QFunction for quadrature data
+  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup_sur, 0));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_sur, "dx", num_comp_x * (dim - height), CEED_EVAL_GRAD));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT));
+  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup_sur, "surface qdata", *q_data_size, CEED_EVAL_NONE));
+
+  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, *q_data_size, elem_restr_qd));
+  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(*elem_restr_qd, q_data, NULL));
+
+  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup_sur, NULL, NULL, &op_setup_sur));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", *elem_restr_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+  PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, x_coord, *q_data, CEED_REQUEST_IMMEDIATE));
+
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup_sur));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index ec8048d657..972fffa459 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -102,26 +102,14 @@ static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DML
   PetscFunctionBeginUser;
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &elem_restr_q_sur));
   PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &elem_restr_x_sur));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_sur, &elem_restr_qd_i_sur));
   if (jac_data_size_sur > 0) {
     // State-dependent data will be passed from residual to Jacobian. This will be collocated.
     PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_sur, &elem_restr_jd_i_sur));
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i_sur, &jac_data_sur, NULL));
   }
 
-  {  // Create q_data_sur vector
-    CeedOperator op_setup_sur;
-
-    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_qd_i_sur, &q_data_sur, NULL));
-
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, basis_x_sur, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_sur, CEED_VECTOR_NONE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-    PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, CEED_REQUEST_IMMEDIATE));
-    PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
-  }
+  PetscCall(QDataBoundaryGet(ceed, dm, domain_label, label_value, elem_restr_x_sur, basis_x_sur, ceed_data->x_coord, &elem_restr_qd_i_sur,
+                             &q_data_sur, &q_data_size_sur));
 
   // CEED Operator for Physics
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc, NULL, NULL, &op_apply_bc));
@@ -191,14 +179,15 @@ static PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt nu
 static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc, ProblemData problem, CeedData ceed_data, CeedOperator op_apply,
                                         CeedOperator op_apply_ijacobian) {
   CeedInt       height = 1, num_comp_q, num_comp_x;
-  CeedInt       dim_sur, P_sur = user->app_ctx->degree + 1, Q_sur = P_sur + user->app_ctx->q_extra;
-  const CeedInt q_data_size_sur = problem->q_data_size_sur, jac_data_size_sur = user->phys->implicit ? problem->jac_data_size_sur : 0;
+  CeedInt       P_sur = user->app_ctx->degree + 1, Q_sur = P_sur + user->app_ctx->q_extra, dim_sur, q_data_size_sur;
+  const CeedInt jac_data_size_sur = user->phys->implicit ? problem->jac_data_size_sur : 0;
   PetscInt      dim;
   DMLabel       face_sets_label;
   CeedBasis     basis_q_sur, basis_x_sur;
 
   PetscFunctionBeginUser;
   PetscCall(DMGetDimension(dm, &dim));
+  PetscCall(QDataBoundaryGetNumComponents(dm, &q_data_size_sur));
   dim_sur = dim - height;
   {  // Get number of components and coordinate dimension from op_apply
     CeedOperator       *sub_ops;
@@ -229,16 +218,6 @@ static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc
 
   PetscCall(DMGetLabel(dm, "Face Sets", &face_sets_label));
 
-  {  // -- Create QFunction for quadrature data
-    PetscCallCeed(ceed,
-                  CeedQFunctionCreateInterior(ceed, 1, problem->setup_sur.qfunction, problem->setup_sur.qfunction_loc, &ceed_data->qf_setup_sur));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_sur, problem->setup_sur.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_sur, 0));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "dx", num_comp_x * dim_sur, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_sur, "surface qdata", q_data_size_sur, CEED_EVAL_NONE));
-  }
-
   {  // --- Create Sub-Operator for inflow boundaries
     CeedQFunction qf_apply_inflow = NULL, qf_apply_inflow_jacobian = NULL;
 
@@ -298,7 +277,7 @@ static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc
 
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc) {
   const PetscInt      num_comp_q = 5;
-  const CeedInt       dim = problem->dim, num_comp_x = problem->dim, q_data_size_vol = problem->q_data_size_vol;
+  const CeedInt       dim = problem->dim, num_comp_x = problem->dim;
   CeedInt             jac_data_size_vol = num_comp_q + 6 + 3;
   CeedElemRestriction elem_restr_jd_i;
   CeedVector          jac_data;
@@ -324,51 +303,30 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
 
     PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q));
     PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x));
-    PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_vol, &ceed_data->elem_restr_qd_i));
     PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i));
 
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL));
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL));
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL));
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL));
-    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_qd_i, &ceed_data->q_data, NULL));
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL));
-  }
 
-  {  // -- Copy PETSc coordinate vector into CEED vector
-    Vec X_loc;
-    DM  cdm;
-
-    PetscCall(DMGetCellCoordinateDM(dm, &cdm));
-    if (cdm) {
-      PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc));
-    } else {
-      PetscCall(DMGetCoordinatesLocal(dm, &X_loc));
+    {  // -- Copy PETSc coordinate vector into CEED vector
+      Vec X_loc;
+      DM  cdm;
+
+      PetscCall(DMGetCellCoordinateDM(dm, &cdm));
+      if (cdm) {
+        PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc));
+      } else {
+        PetscCall(DMGetCoordinatesLocal(dm, &X_loc));
+      }
+      PetscCall(VecScale(X_loc, problem->dm_scale));
+      PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord));
     }
-    PetscCall(VecScale(X_loc, problem->dm_scale));
-    PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord));
-  }
-
-  {  // -- Create quadrature data
-    CeedQFunction qf_setup_vol;
-    CeedOperator  op_setup_vol;
-
-    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_vol.qfunction, problem->setup_vol.qfunction_loc, &qf_setup_vol));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_setup_vol, problem->setup_vol.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup_vol, 0));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_vol, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_vol, "weight", 1, CEED_EVAL_WEIGHT));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup_vol, NULL, NULL, &op_setup_vol));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_vol, "dx", ceed_data->elem_restr_x, ceed_data->basis_x, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_vol, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x, CEED_VECTOR_NONE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_setup_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-    PetscCallCeed(ceed, CeedOperatorApply(op_setup_vol, ceed_data->x_coord, ceed_data->q_data, CEED_REQUEST_IMMEDIATE));
 
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup_vol));
-    PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_vol));
+    PetscCall(QDataGet(ceed, dm, domain_label, label_value, ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord,
+                       &ceed_data->elem_restr_qd_i, &ceed_data->q_data, &problem->q_data_size_vol));
   }
 
   {  // -- Create QFunction for ICs
@@ -404,7 +362,7 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_rhs_vol, 0));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
@@ -430,7 +388,7 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
@@ -458,7 +416,7 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ijacobian_vol, 0));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "dq", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "Grad_dq", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index de3e3e2fab..5b7198952b 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -21,9 +21,9 @@
 #include "../navierstokes.h"
 
 typedef struct {
-  CeedElemRestriction elem_restr_parent_x, elem_restr_parent_stats, elem_restr_parent_qd, elem_restr_parent_colloc, elem_restr_child_colloc;
+  CeedElemRestriction elem_restr_parent_x, elem_restr_parent_stats, elem_restr_parent_colloc, elem_restr_child_colloc;
   CeedBasis           basis_x, basis_stats;
-  CeedVector          x_coord, q_data;
+  CeedVector          x_coord;
 } *SpanStatsSetupData;
 
 PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) {
@@ -193,11 +193,8 @@ PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data
 
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &(*stats_data)->elem_restr_parent_stats));
   PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &(*stats_data)->elem_restr_parent_x));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, problem->q_data_size_sur,
-                                                 &(*stats_data)->elem_restr_parent_qd));
   PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents((*stats_data)->elem_restr_parent_x, &num_comp_x));
   PetscCallCeed(ceed, CeedElemRestrictionCreateVector((*stats_data)->elem_restr_parent_x, &(*stats_data)->x_coord, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector((*stats_data)->elem_restr_parent_qd, &(*stats_data)->q_data, NULL));
 
   {
     DM dm_coord;
@@ -232,7 +229,6 @@ PetscErrorCode SpanStatsSetupDataDestroy(SpanStatsSetupData data) {
   PetscCall(CeedElemRestrictionGetCeed(data->elem_restr_parent_x, &ceed));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_x));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_stats));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_qd));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_colloc));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_child_colloc));
 
@@ -240,7 +236,6 @@ PetscErrorCode SpanStatsSetupDataDestroy(SpanStatsSetupData data) {
   PetscCallCeed(ceed, CeedBasisDestroy(&data->basis_stats));
 
   PetscCallCeed(ceed, CeedVectorDestroy(&data->x_coord));
-  PetscCallCeed(ceed, CeedVectorDestroy(&data->q_data));
 
   PetscCall(PetscFree(data));
   PetscFunctionReturn(PETSC_SUCCESS);
@@ -296,9 +291,13 @@ PetscErrorCode CreateStatsSF(Ceed ceed, CeedData ceed_data, SpanStatsSetupData s
 
 // @brief Setup RHS and LHS for L^2 projection of statistics
 PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data, SpanStatsSetupData stats_data) {
-  CeedOperator  op_mass, op_setup_sur, op_proj_rhs;
-  CeedQFunction qf_mass, qf_stats_proj;
-  CeedInt       q_data_size, num_comp_stats = user->spanstats.num_comp_stats;
+  CeedOperator        op_mass, op_proj_rhs;
+  CeedQFunction       qf_mass, qf_stats_proj;
+  CeedInt             q_data_size, num_comp_stats = user->spanstats.num_comp_stats;
+  CeedElemRestriction elem_restr_qd;
+  CeedVector          q_data;
+  DMLabel             domain_label = NULL;
+  PetscInt            label_value  = 0;
 
   PetscFunctionBeginUser;
   // -- Create Operator for RHS of L^2 projection of statistics
@@ -312,23 +311,14 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
 
   PetscCall(OperatorApplyContextCreate(NULL, user->spanstats.dm, ceed, op_proj_rhs, NULL, NULL, NULL, NULL, &user->spanstats.op_proj_rhs_ctx));
   PetscCall(CeedOperatorCreateLocalVecs(op_proj_rhs, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, &user->spanstats.Parent_Stats_loc, NULL));
-
-  // -- Setup LHS of L^2 projection
-  // Get q_data for mass matrix operator
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", stats_data->elem_restr_parent_x, stats_data->basis_x, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, stats_data->basis_x, CEED_VECTOR_NONE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, stats_data->x_coord, stats_data->q_data, CEED_REQUEST_IMMEDIATE));
-
-  // CEED Restriction
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(stats_data->elem_restr_parent_qd, &q_data_size));
+  PetscCall(QDataGet(ceed, user->spanstats.dm, domain_label, label_value, stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord,
+                     &elem_restr_qd, &q_data, &q_data_size));
 
   // Create Mass CeedOperator
   PetscCall(CreateMassQFunction(ceed, num_comp_stats, q_data_size, &qf_mass));
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_mass, NULL, NULL, &op_mass));
   PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "u", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, stats_data->q_data));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "qdata", elem_restr_qd, CEED_BASIS_NONE, q_data));
   PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "v", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
 
   {  // Setup KSP for L^2 projection
@@ -354,10 +344,11 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
   }
 
   // Cleanup
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd));
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stats_proj));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_mass));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_proj_rhs));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -439,13 +430,18 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
 
 // Creates operator for calculating error of method of manufactured solution (MMS) test
 PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, SpanStatsSetupData stats_data) {
-  CeedInt       num_comp_stats = user->spanstats.num_comp_stats, num_comp_x, q_data_size;
-  CeedQFunction qf_error;
-  CeedOperator  op_error;
-  CeedVector    x_ceed, y_ceed;
+  CeedInt             num_comp_stats = user->spanstats.num_comp_stats, num_comp_x, q_data_size;
+  CeedQFunction       qf_error;
+  CeedOperator        op_error;
+  CeedVector          x_ceed, y_ceed;
+  DMLabel             domain_label = NULL;
+  PetscInt            label_value  = 0;
+  CeedVector          q_data;
+  CeedElemRestriction elem_restr_parent_qd;
 
   PetscFunctionBeginUser;
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(stats_data->elem_restr_parent_qd, &q_data_size));
+  PetscCall(QDataGet(ceed, user->spanstats.dm, domain_label, label_value, stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord,
+                     &elem_restr_parent_qd, &q_data, &q_data_size));
   PetscCallCeed(ceed, CeedBasisGetNumComponents(stats_data->basis_x, &num_comp_x));
 
   PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollectionMMSTest_Error, ChildStatsCollectionMMSTest_Error_loc, &qf_error));
@@ -456,7 +452,7 @@ PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, S
 
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_error, NULL, NULL, &op_error));
   PetscCallCeed(ceed, CeedOperatorSetField(op_error, "q", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_error, "qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, stats_data->q_data));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_error, "qdata", elem_restr_parent_qd, CEED_BASIS_NONE, q_data));
   PetscCallCeed(ceed, CeedOperatorSetField(op_error, "x", stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord));
   PetscCallCeed(ceed, CeedOperatorSetField(op_error, "v", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
 
@@ -465,10 +461,12 @@ PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, S
   PetscCall(OperatorApplyContextCreate(user->spanstats.dm, user->spanstats.dm, user->ceed, op_error, x_ceed, y_ceed, NULL, NULL,
                                        &user->spanstats.mms_error_ctx));
 
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_error));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_error));
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
   PetscCallCeed(ceed, CeedVectorDestroy(&x_ceed));
   PetscCallCeed(ceed, CeedVectorDestroy(&y_ceed));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_parent_qd));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_error));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_error));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 

From be23f017229a5519b10d2d9a7244384ddded5ee3 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 30 Apr 2024 12:05:20 -0600
Subject: [PATCH 014/571] Add $(PYTHON) to prove command

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 69f05b0673..750ea2a6cd 100644
--- a/Makefile
+++ b/Makefile
@@ -686,7 +686,7 @@ ctc-% : $(ctests);@$(foreach tst,$(ctests),$(tst) /cpu/$*;)
 
 prove : $(matched)
 	$(info Testing backends: $(BACKENDS))
-	$(PROVE) $(PROVE_OPTS) --exec 'tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
+	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
 # Run prove target in parallel
 prv : ;@$(MAKE) $(MFLAGS) V=$(V) prove
 

From f220c67c2ee09a04caa5f08b43d289c53908d385 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 6 Feb 2024 10:33:37 -0700
Subject: [PATCH 015/571] xsmm - update for function name change

---
 backends/xsmm/ceed-xsmm-tensor.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c
index 0d7383bf40..1dc69b30a8 100644
--- a/backends/xsmm/ceed-xsmm-tensor.c
+++ b/backends/xsmm/ceed-xsmm-tensor.c
@@ -30,7 +30,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A,
                                                                             LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64)
                                                 : libxsmm_create_gemm_shape(J, A, B, !t_mode ? B : J, B, J, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
                                                                             LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32);
-    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
+    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
     libxsmm_gemm_param         gemm_param;
 
     CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
@@ -50,7 +50,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A,
                                                                             LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64)
                                                 : libxsmm_create_gemm_shape(C, J, B, C, !t_mode ? B : J, C, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
                                                                             LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32);
-    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
+    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
     libxsmm_gemm_param         gemm_param;
 
     CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");

From ed721ed881af0c4c392c60e338da4febcd1996ba Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 2 May 2024 09:41:12 -0600
Subject: [PATCH 016/571] ci - update LIBXSMM commit hash used

---
 .gitlab-ci.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 58f7e93f59..3ae8918f2a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -72,8 +72,8 @@ noether-cpu:
     - echo "-------------- FC ------------------" && $FC --version
     - echo "-------------- GCOV ----------------" && gcov --version
 # Libraries for backends
-# -- LIBXSMM 2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8
-    - cd .. && export XSMM_HASH=2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
+# -- LIBXSMM 7 April 2024
+    - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
 # -- OCCA v1.6.0
     - cd .. && export OCCA_VERSION=occa-1.6.0 && { [[ -d $OCCA_VERSION ]] || { git clone --depth 1 --branch v1.6.0 https://github.com/libocca/occa.git $OCCA_VERSION && cd $OCCA_VERSION && export ENABLE_OPENCL="OFF" ENABLE_DPCPP="OFF" ENABLE_HIP="OFF" ENABLE_CUDA="OFF" && ./configure-cmake.sh && cmake --build build --parallel $NPROC_CPU && cmake --install build && cd ..; }; } && export OCCA_DIR=$PWD/$OCCA_VERSION/install && cd libCEED
@@ -321,8 +321,8 @@ noether-float:
 # -- MAGMA from dev branch
     - echo "-------------- MAGMA ---------------"
     - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
-# -- LIBXSMM 2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8
-    - cd .. && export XSMM_HASH=2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
+# -- LIBXSMM 7 April 2024
+    - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
   script:
     - rm -f .SUCCESS

From ba0bd193a1140f803f01201da36534a23dc839cc Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 2 May 2024 09:59:47 -0600
Subject: [PATCH 017/571] readme - add LIBXSMM versioning minimum

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7c2d32c0ea..99ba285426 100644
--- a/README.md
+++ b/README.md
@@ -207,6 +207,7 @@ This backend can be run in serial or blocked mode and defaults to running in the
 
 The `/cpu/self/xsmm/*` backends rely upon the [LIBXSMM](https://github.com/libxsmm/libxsmm) package to provide vectorized CPU performance.
 If linking MKL and LIBXSMM is desired but the Makefile is not detecting `MKLROOT`, linking libCEED against MKL can be forced by setting the environment variable `MKL=1`.
+The LIBXSMM `main` development branch from 7 April 2024 or newer is required.
 
 The `/gpu/cuda/*` backends provide GPU performance strictly using CUDA.
 

From 5037d55df331588b72bed903a854832c66414480 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 2 May 2024 14:01:36 -0600
Subject: [PATCH 018/571] fluids: Update MatCeed with Ratel updates

---
 examples/fluids/include/mat-ceed-impl.h       |  55 +--
 examples/fluids/include/mat-ceed.h            |  58 ++-
 .../{ceed-utils.h => petsc-ceed-utils.h}      |  48 ++-
 examples/fluids/include/petsc-ceed.h          |  42 ++
 examples/fluids/navierstokes.h                |   2 +-
 examples/fluids/src/mat-ceed.c                | 385 +++++++++++-------
 examples/fluids/src/petsc_ops.c               |   2 +-
 7 files changed, 362 insertions(+), 230 deletions(-)
 rename examples/fluids/include/{ceed-utils.h => petsc-ceed-utils.h} (86%)
 create mode 100644 examples/fluids/include/petsc-ceed.h

diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h
index 0bfff6180a..cb49fe901e 100644
--- a/examples/fluids/include/mat-ceed-impl.h
+++ b/examples/fluids/include/mat-ceed-impl.h
@@ -7,45 +7,16 @@
 #pragma once
 
 #include <ceed.h>
+#include <petsc-ceed.h>
 #include <petscdm.h>
 #include <petscmat.h>
 #include <petsc/private/petscimpl.h>
 
-#if defined(__clang_analyzer__)
-#define MATCEED_EXTERN extern
-#elif defined(__cplusplus)
-#define MATCEED_EXTERN extern "C"
-#else
-#define MATCEED_EXTERN extern
-#endif
-
-#if defined(__clang_analyzer__)
-#define MATCEED_INTERN
-#else
-#define MATCEED_INTERN MATCEED_EXTERN __attribute__((visibility("hidden")))
-#endif
-
-/**
-  @brief Calls a libCEED function and then checks the resulting error code.
-  If the error code is non-zero, then a PETSc error is set with the libCEED error message.
-**/
-#ifndef PetscCallCeed
-#define PetscCallCeed(ceed_, ...)                                   \
-  do {                                                              \
-    int ierr_q_ = __VA_ARGS__;                                      \
-    if (ierr_q_ != CEED_ERROR_SUCCESS) {                            \
-      const char *error_message;                                    \
-      CeedGetErrorMessage(ceed_, &error_message);                   \
-      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \
-    }                                                               \
-  } while (0)
-#endif
-
 // MatCeed context for applying composite CeedOperator on a DM
 typedef struct MatCeedContext_private *MatCeedContext;
 struct MatCeedContext_private {
   Ceed           ceed;
-  char          *name, *internal_mat_type;
+  char          *name, *coo_mat_type;
   PetscMemType   mem_type;
   PetscInt       ref_count, num_mats_assembled_full, num_mats_assembled_pbd;
   PetscBool      is_destroyed, is_ceed_pbd_valid, is_ceed_vpbd_valid;
@@ -59,17 +30,17 @@ struct MatCeedContext_private {
 };
 
 // Context data
-MATCEED_INTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult,
-                                                   CeedOperator op_mult_transpose, PetscLogEvent log_event_mult,
-                                                   PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx);
-MATCEED_INTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx);
-MATCEED_INTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy);
-MATCEED_INTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx);
-
-// Mat Ceed
-MATCEED_INTERN PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D);
-MATCEED_INTERN PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y);
-MATCEED_INTERN PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult,
+                                                      CeedOperator op_mult_transpose, PetscLogEvent log_event_mult,
+                                                      PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx);
+
+// MatCEED
+PETSC_CEED_EXTERN PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D);
+PETSC_CEED_EXTERN PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y);
+PETSC_CEED_EXTERN PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X);
 
 extern PetscClassId  MATCEED_CLASSID;
 extern PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE;
diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h
index 75a7a612dd..8b05001273 100644
--- a/examples/fluids/include/mat-ceed.h
+++ b/examples/fluids/include/mat-ceed.h
@@ -7,38 +7,36 @@
 #pragma once
 
 #include <ceed.h>
+#include <petsc-ceed.h>
 #include <petscdm.h>
 #include <petscmat.h>
 
 #define MATCEED "ceed"
 
-#if defined(__clang_analyzer__)
-#define MATCEED_EXTERN extern
-#elif defined(__cplusplus)
-#define MATCEED_EXTERN extern "C"
-#else
-#define MATCEED_EXTERN extern
-#endif
-
-#if defined(__clang_analyzer__)
-#define MATCEED_INTERN
-#else
-#define MATCEED_INTERN MATCEED_EXTERN __attribute__((visibility("hidden")))
-#endif
-
-// Context data
-MATCEED_INTERN PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat);
-MATCEED_INTERN PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other);
-MATCEED_INTERN PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo);
-MATCEED_INTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx);
-MATCEED_INTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx);
-MATCEED_INTERN PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type);
-MATCEED_INTERN PetscErrorCode MatCeedGetInnerMatType(Mat mat, MatType *type);
-MATCEED_INTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void));
-MATCEED_INTERN PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedRestoreLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedGetCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedRestoreCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedSetLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose);
+// Core functionality
+PETSC_CEED_EXTERN PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedCreateMatCOO(Mat mat_ceed, Mat *mat_coo);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo);
+
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value);
+PETSC_CEED_INTERN PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value);
+
+// Advanced functionality
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void));
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCOOMatType(Mat mat, MatType *type);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedRestoreLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedRestoreCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose);
diff --git a/examples/fluids/include/ceed-utils.h b/examples/fluids/include/petsc-ceed-utils.h
similarity index 86%
rename from examples/fluids/include/ceed-utils.h
rename to examples/fluids/include/petsc-ceed-utils.h
index 10d1eeb615..936f278ee3 100644
--- a/examples/fluids/include/ceed-utils.h
+++ b/examples/fluids/include/petsc-ceed-utils.h
@@ -9,17 +9,43 @@
 #include <ceed.h>
 #include <petscdm.h>
 
-#define PetscCallCeed(ceed, ...)                                    \
-  do {                                                              \
-    int ierr_q_;                                                    \
-    PetscStackUpdateLine;                                           \
-    ierr_q_ = __VA_ARGS__;                                          \
-    if (PetscUnlikely(ierr_q_ != CEED_ERROR_SUCCESS)) {             \
-      const char *error_message;                                    \
-      CeedGetErrorMessage(ceed, &error_message);                    \
-      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \
-    }                                                               \
-  } while (0)
+/**
+  @brief Copy the reference to a `Vec`.
+         Note: If `vec_copy` is non-null, it is assumed to be a valid pointer to a `Vec` and `VecDestroy()` will be called.
+
+  Collective across MPI processes.
+
+  @param[in]   vec       `Vec` to reference
+  @param[out]  vec_copy  Copy of reference
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+static inline PetscErrorCode VecReferenceCopy(Vec vec, Vec *vec_copy) {
+  PetscFunctionBeginUser;
+  PetscCall(PetscObjectReference((PetscObject)vec));
+  PetscCall(VecDestroy(vec_copy));
+  *vec_copy = vec;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Copy the reference to a `DM`.
+         Note: If `dm_copy` is non-null, it is assumed to be a valid pointer to a `DM` and `DMDestroy()` will be called.
+
+  Collective across MPI processes.
+
+  @param[in]   dm       `DM` to reference
+  @param[out]  dm_copy  Copy of reference
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+static inline PetscErrorCode DMReferenceCopy(DM dm, DM *dm_copy) {
+  PetscFunctionBeginUser;
+  PetscCall(PetscObjectReference((PetscObject)dm));
+  PetscCall(DMDestroy(dm_copy));
+  *dm_copy = dm;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
 
 /**
   @brief Translate PetscMemType to CeedMemType
diff --git a/examples/fluids/include/petsc-ceed.h b/examples/fluids/include/petsc-ceed.h
new file mode 100644
index 0000000000..6e77c39e59
--- /dev/null
+++ b/examples/fluids/include/petsc-ceed.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <petscsys.h>
+
+#if defined(__clang_analyzer__)
+#define PETSC_CEED_EXTERN extern
+#elif defined(__cplusplus)
+#define PETSC_CEED_EXTERN extern "C"
+#else
+#define PETSC_CEED_EXTERN extern
+#endif
+
+#if defined(__clang_analyzer__)
+#define PETSC_CEED_INTERN
+#else
+#define PETSC_CEED_INTERN PETSC_CEED_EXTERN __attribute__((visibility("hidden")))
+#endif
+
+/**
+  @brief Calls a libCEED function and then checks the resulting error code.
+  If the error code is non-zero, then a PETSc error is set with the libCEED error message.
+**/
+/// @ingroup RatelInternal
+#ifndef PetscCallCeed
+#define PetscCallCeed(ceed_, ...)                                   \
+  do {                                                              \
+    int ierr_q_;                                                    \
+    PetscStackUpdateLine;                                           \
+    ierr_q_ = __VA_ARGS__;                                          \
+    if (PetscUnlikely(ierr_q_ != CEED_ERROR_SUCCESS)) {             \
+      const char *error_message;                                    \
+      CeedGetErrorMessage(ceed_, &error_message);                   \
+      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \
+    }                                                               \
+  } while (0)
+#endif
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index ce8da6ea28..21732d6c83 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -6,9 +6,9 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed-utils.h>
 #include <ceed.h>
 #include <mat-ceed.h>
+#include <petsc-ceed-utils.h>
 #include <petscts.h>
 #include <stdbool.h>
 
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index f5118bccff..f08c71d1a3 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -1,11 +1,12 @@
 /// @file
-/// MatCeed and it's related operators
+/// MatCEED implementation
 
-#include <ceed-utils.h>
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <mat-ceed-impl.h>
 #include <mat-ceed.h>
+#include <petsc-ceed-utils.h>
+#include <petsc-ceed.h>
 #include <petscdmplex.h>
 #include <stdlib.h>
 #include <string.h>
@@ -21,67 +22,14 @@ PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE;
   @return An error code: 0 - success, otherwise - failure
 **/
 static PetscErrorCode MatCeedRegisterLogEvents() {
-  static bool registered = false;
+  static PetscBool registered = PETSC_FALSE;
 
   PetscFunctionBeginUser;
   if (registered) PetscFunctionReturn(PETSC_SUCCESS);
   PetscCall(PetscClassIdRegister("MATCEED", &MATCEED_CLASSID));
   PetscCall(PetscLogEventRegister("MATCEED Mult", MATCEED_CLASSID, &MATCEED_MULT));
   PetscCall(PetscLogEventRegister("MATCEED Mult Transpose", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE));
-  registered = true;
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-/**
-  @brief Setup inner `Mat` for `PC` operations not directly supported by libCEED.
-
-  Collective across MPI processes.
-
-  @param[in]   mat_ceed   `MATCEED` to setup
-  @param[out]  mat_inner  Inner `Mat`
-
-  @return An error code: 0 - success, otherwise - failure
-**/
-static PetscErrorCode MatCeedSetupInnerMat(Mat mat_ceed, Mat *mat_inner) {
-  MatCeedContext ctx;
-
-  PetscFunctionBeginUser;
-  PetscCall(MatShellGetContext(mat_ceed, &ctx));
-
-  PetscCheck(ctx->dm_x == ctx->dm_y, PetscObjectComm((PetscObject)mat_ceed), PETSC_ERR_SUP, "PC only supported for MATCEED on a single DM");
-
-  // Check cl mat type
-  {
-    PetscBool is_internal_mat_type_cl = PETSC_FALSE;
-    char      internal_mat_type_cl[64];
-
-    // Check for specific CL inner mat type for this Mat
-    {
-      const char *mat_ceed_prefix = NULL;
-
-      PetscCall(MatGetOptionsPrefix(mat_ceed, &mat_ceed_prefix));
-      PetscOptionsBegin(PetscObjectComm((PetscObject)mat_ceed), mat_ceed_prefix, "", NULL);
-      PetscCall(PetscOptionsFList("-ceed_inner_mat_type", "MATCEED inner assembled MatType for PC support", NULL, MatList, internal_mat_type_cl,
-                                  internal_mat_type_cl, sizeof(internal_mat_type_cl), &is_internal_mat_type_cl));
-      PetscOptionsEnd();
-      if (is_internal_mat_type_cl) {
-        PetscCall(PetscFree(ctx->internal_mat_type));
-        PetscCall(PetscStrallocpy(internal_mat_type_cl, &ctx->internal_mat_type));
-      }
-    }
-  }
-
-  // Create sparse matrix
-  {
-    MatType dm_mat_type, dm_mat_type_copy;
-
-    PetscCall(DMGetMatType(ctx->dm_x, &dm_mat_type));
-    PetscCall(PetscStrallocpy(dm_mat_type, (char **)&dm_mat_type_copy));
-    PetscCall(DMSetMatType(ctx->dm_x, ctx->internal_mat_type));
-    PetscCall(DMCreateMatrix(ctx->dm_x, mat_inner));
-    PetscCall(DMSetMatType(ctx->dm_x, dm_mat_type_copy));
-    PetscCall(PetscFree(dm_mat_type_copy));
-  }
+  registered = PETSC_TRUE;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -177,14 +125,14 @@ static PetscErrorCode MatCeedAssembleInnerBlockDiagonalMat(Mat mat_ceed, PetscBo
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
   if (use_ceed_pbd) {
     // Check if COO pattern set
-    if (!ctx->mat_assembled_pbd_internal) PetscCall(MatCeedSetupInnerMat(mat_ceed, &ctx->mat_assembled_pbd_internal));
+    if (!ctx->mat_assembled_pbd_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_pbd_internal));
 
     // Assemble mat_assembled_full_internal
     PetscCall(MatCeedAssemblePointBlockDiagonalCOO(mat_ceed, ctx->mat_assembled_pbd_internal));
     if (mat_inner) *mat_inner = ctx->mat_assembled_pbd_internal;
   } else {
     // Check if COO pattern set
-    if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedSetupInnerMat(mat_ceed, &ctx->mat_assembled_full_internal));
+    if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_full_internal));
 
     // Assemble mat_assembled_full_internal
     PetscCall(MatCeedAssembleCOO(mat_ceed, ctx->mat_assembled_full_internal));
@@ -299,7 +247,7 @@ static PetscErrorCode MatView_Ceed(Mat mat_ceed, PetscViewer viewer) {
   {
     FILE *file;
 
-    PetscCall(PetscViewerASCIIPrintf(viewer, "MatCEED:\n  Internal MatType:%s\n", ctx->internal_mat_type));
+    PetscCall(PetscViewerASCIIPrintf(viewer, "MatCEED:\n  Default COO MatType:%s\n", ctx->coo_mat_type));
     PetscCall(PetscViewerASCIIGetPointer(viewer, &file));
     PetscCall(PetscViewerASCIIPrintf(viewer, " libCEED Operator:\n"));
     PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult, file));
@@ -358,6 +306,7 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
     Y_g_size = X_g_size;
     Y_l_size = X_l_size;
   }
+
   // Create context
   {
     Vec X_loc, Y_loc_transpose = NULL;
@@ -469,13 +418,13 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
   // -- Set internal mat type
   {
     VecType vec_type;
-    MatType internal_mat_type = MATAIJ;
+    MatType coo_mat_type;
 
     PetscCall(VecGetType(ctx->X_loc, &vec_type));
-    if (strstr(vec_type, VECCUDA)) internal_mat_type = MATAIJCUSPARSE;
-    else if (strstr(vec_type, VECKOKKOS)) internal_mat_type = MATAIJKOKKOS;
-    else internal_mat_type = MATAIJ;
-    PetscCall(PetscStrallocpy(internal_mat_type, &ctx->internal_mat_type));
+    if (strstr(vec_type, VECCUDA)) coo_mat_type = MATAIJCUSPARSE;
+    else if (strstr(vec_type, VECKOKKOS)) coo_mat_type = MATAIJKOKKOS;
+    else coo_mat_type = MATAIJ;
+    PetscCall(PetscStrallocpy(coo_mat_type, &ctx->coo_mat_type));
   }
   // -- Set mat operations
   PetscCall(MatShellSetContextDestroy(*mat, (PetscErrorCode(*)(void *))MatCeedContextDestroy));
@@ -506,13 +455,16 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
 
   // Check type compatibility
   {
-    MatType mat_type_ceed, mat_type_other;
+    PetscBool is_matceed = PETSC_FALSE, is_matshell = PETSC_FALSE;
+    MatType   mat_type_ceed, mat_type_other;
 
     PetscCall(MatGetType(mat_ceed, &mat_type_ceed));
-    PetscCheck(!strcmp(mat_type_ceed, MATCEED), PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_ceed must have type " MATCEED);
-    PetscCall(MatGetType(mat_ceed, &mat_type_other));
-    PetscCheck(!strcmp(mat_type_other, MATCEED) || !strcmp(mat_type_other, MATSHELL), PETSC_COMM_SELF, PETSC_ERR_LIB,
-               "mat_other must have type " MATCEED " or " MATSHELL);
+    PetscCall(PetscStrcmp(mat_type_ceed, MATCEED, &is_matceed));
+    PetscCheck(is_matceed, PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_ceed must have type " MATCEED);
+    PetscCall(MatGetType(mat_other, &mat_type_other));
+    PetscCall(PetscStrcmp(mat_type_other, MATCEED, &is_matceed));
+    PetscCall(PetscStrcmp(mat_type_other, MATSHELL, &is_matceed));
+    PetscCheck(is_matceed || is_matshell, PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_other must have type " MATCEED " or " MATSHELL);
   }
 
   // Check dimension compatibility
@@ -568,6 +520,102 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Setup a `Mat` with the same COO pattern as a `MatCEED`.
+
+  Collective across MPI processes.
+
+  @param[in]   mat_ceed  `MATCEED`
+  @param[out]  mat_coo   Sparse `Mat` with same COO pattern
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedCreateMatCOO(Mat mat_ceed, Mat *mat_coo) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+
+  PetscCheck(ctx->dm_x == ctx->dm_y, PetscObjectComm((PetscObject)mat_ceed), PETSC_ERR_SUP, "COO assembly only supported for MATCEED on a single DM");
+
+  // Check cl mat type
+  {
+    PetscBool is_coo_mat_type_cl = PETSC_FALSE;
+    char      coo_mat_type_cl[64];
+
+    // Check for specific CL coo mat type for this Mat
+    {
+      const char *mat_ceed_prefix = NULL;
+
+      PetscCall(MatGetOptionsPrefix(mat_ceed, &mat_ceed_prefix));
+      PetscOptionsBegin(PetscObjectComm((PetscObject)mat_ceed), mat_ceed_prefix, "", NULL);
+      PetscCall(PetscOptionsFList("-ceed_coo_mat_type", "Default MATCEED COO assembly MatType", NULL, MatList, coo_mat_type_cl, coo_mat_type_cl,
+                                  sizeof(coo_mat_type_cl), &is_coo_mat_type_cl));
+      PetscOptionsEnd();
+      if (is_coo_mat_type_cl) {
+        PetscCall(PetscFree(ctx->coo_mat_type));
+        PetscCall(PetscStrallocpy(coo_mat_type_cl, &ctx->coo_mat_type));
+      }
+    }
+  }
+
+  // Create sparse matrix
+  {
+    MatType dm_mat_type, dm_mat_type_copy;
+
+    PetscCall(DMGetMatType(ctx->dm_x, &dm_mat_type));
+    PetscCall(PetscStrallocpy(dm_mat_type, (char **)&dm_mat_type_copy));
+    PetscCall(DMSetMatType(ctx->dm_x, ctx->coo_mat_type));
+    PetscCall(DMCreateMatrix(ctx->dm_x, mat_coo));
+    PetscCall(DMSetMatType(ctx->dm_x, dm_mat_type_copy));
+    PetscCall(PetscFree(dm_mat_type_copy));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Setup the COO preallocation `MATCEED` into a `MATAIJ` or similar.
+         The caller is responsible for assuring the global and local sizes are compatible, otherwise this function will fail.
+
+  Collective across MPI processes.
+
+  @param[in]      mat_ceed  `MATCEED` to assemble
+  @param[in,out]  mat_coo   `MATAIJ` or similar to assemble into
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+
+  {
+    PetscInt     *rows_petsc = NULL, *cols_petsc = NULL;
+    CeedInt      *rows_ceed, *cols_ceed;
+    PetscCount    num_entries;
+    PetscLogStage stage_amg_setup;
+
+    // -- Assemble sparsity pattern if mat hasn't been assembled before
+    PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup));
+    if (stage_amg_setup == -1) {
+      PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup));
+    }
+    PetscCall(PetscLogStagePush(stage_amg_setup));
+    PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed));
+    PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc));
+    PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc));
+    PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc));
+    free(rows_petsc);
+    free(cols_petsc);
+    if (!ctx->coo_values_full) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_full));
+    PetscCall(PetscRealloc(++ctx->num_mats_assembled_full * sizeof(Mat), &ctx->mats_assembled_full));
+    ctx->mats_assembled_full[ctx->num_mats_assembled_full - 1] = mat_coo;
+    PetscCall(PetscLogStagePop());
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Assemble a `MATCEED` into a `MATAIJ` or similar.
          The `mat_coo` preallocation is set to match the sparsity pattern of `mat_ceed`.
@@ -586,36 +634,14 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
-  // Check if COO pattern set
+  // Set COO pattern if needed
   {
-    PetscInt index = -1;
+    CeedInt index = -1;
 
     for (PetscInt i = 0; i < ctx->num_mats_assembled_full; i++) {
       if (ctx->mats_assembled_full[i] == mat_coo) index = i;
     }
-    if (index == -1) {
-      PetscInt     *rows_petsc = NULL, *cols_petsc = NULL;
-      CeedInt      *rows_ceed, *cols_ceed;
-      PetscCount    num_entries;
-      PetscLogStage stage_amg_setup;
-
-      // -- Assemble sparsity pattern if mat hasn't been assembled before
-      PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup));
-      if (stage_amg_setup == -1) {
-        PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup));
-      }
-      PetscCall(PetscLogStagePush(stage_amg_setup));
-      PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed));
-      PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc));
-      PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc));
-      PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc));
-      free(rows_petsc);
-      free(cols_petsc);
-      if (!ctx->coo_values_full) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_full));
-      PetscCall(PetscRealloc(++ctx->num_mats_assembled_full * sizeof(Mat), &ctx->mats_assembled_full));
-      ctx->mats_assembled_full[ctx->num_mats_assembled_full - 1] = mat_coo;
-      PetscCall(PetscLogStagePop());
-    }
+    if (index == -1) PetscCall(MatCeedSetPreallocationCOO(mat_ceed, mat_coo));
   }
 
   // Assemble mat_ceed
@@ -642,6 +668,84 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Set the current value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat    `MatCEED`
+  @param[in]      name   Name of the context field
+  @param[in]      value  New context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value) {
+  PetscBool      was_updated = PETSC_FALSE;
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  {
+    CeedContextFieldLabel label = NULL;
+
+    PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult, name, &label));
+    if (label) {
+      PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult, label, &value));
+      was_updated = PETSC_TRUE;
+    }
+    if (ctx->op_mult_transpose) {
+      label = NULL;
+      PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult_transpose, name, &label));
+      if (label) {
+        PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult_transpose, label, &value));
+        was_updated = PETSC_TRUE;
+      }
+    }
+  }
+  if (was_updated) {
+    PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
+    PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Get the current value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in]   mat    `MatCEED`
+  @param[in]   name   Name of the context field
+  @param[out]  value  Current context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  {
+    CeedContextFieldLabel label = NULL;
+    CeedOperator          op    = ctx->op_mult;
+
+    PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(op, name, &label));
+    if (!label && ctx->op_mult_transpose) {
+      op = ctx->op_mult_transpose;
+      PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(op, name, &label));
+    }
+    if (label) {
+      PetscSizeT    num_values;
+      const double *values_ceed;
+
+      PetscCallCeed(ctx->ceed, CeedOperatorGetContextDoubleRead(op, label, &num_values, &values_ceed));
+      *value = values_ceed[0];
+      PetscCallCeed(ctx->ceed, CeedOperatorRestoreContextDoubleRead(op, label, &values_ceed));
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Set user context for a `MATCEED`.
 
@@ -686,18 +790,37 @@ PetscErrorCode MatCeedGetContext(Mat mat, void *ctx) {
   else *(void **)ctx = NULL;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
+/**
+  @brief Set a user defined matrix operation for a `MATCEED` matrix.
+
+  Within each user-defined routine, the user should call `MatCeedGetContext()` to obtain the user-defined context that was set by
+`MatCeedSetContext()`.
+
+  Collective across MPI processes.
+
+  @param[in,out]  mat  `MATCEED`
+  @param[in]      op   Name of the `MatOperation`
+  @param[in]      g    Function that provides the operation
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)) {
+  PetscFunctionBeginUser;
+  PetscCall(MatShellSetOperation(mat, op, g));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
 
 /**
-  @brief Sets the inner matrix type as a string from the `MATCEED`.
+  @brief Sets the default COO matrix type as a string from the `MATCEED`.
 
   Collective across MPI processes.
 
   @param[in,out]  mat   `MATCEED`
-  @param[in]      type  Inner `MatType` to set
+  @param[in]      type  COO `MatType` to set
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) {
+PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
@@ -707,9 +830,9 @@ PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) {
     size_t    len_old, len_new;
     PetscBool is_same = PETSC_FALSE;
 
-    PetscCall(PetscStrlen(ctx->internal_mat_type, &len_old));
+    PetscCall(PetscStrlen(ctx->coo_mat_type, &len_old));
     PetscCall(PetscStrlen(type, &len_new));
-    if (len_old == len_new) PetscCall(PetscStrncmp(ctx->internal_mat_type, type, len_old, &is_same));
+    if (len_old == len_new) PetscCall(PetscStrncmp(ctx->coo_mat_type, type, len_old, &is_same));
     if (is_same) PetscFunctionReturn(PETSC_SUCCESS);
   }
   // Clean up old mats in different format
@@ -738,48 +861,28 @@ PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) {
       }
     }
   }
-  PetscCall(PetscFree(ctx->internal_mat_type));
-  PetscCall(PetscStrallocpy(type, &ctx->internal_mat_type));
+  PetscCall(PetscFree(ctx->coo_mat_type));
+  PetscCall(PetscStrallocpy(type, &ctx->coo_mat_type));
   PetscFunctionReturn(PETSC_SUCCESS);
   // LCOV_EXCL_STOP
 }
 
 /**
-  @brief Gets the inner matrix type as a string from the `MATCEED`.
+  @brief Gets the default COO matrix type as a string from the `MATCEED`.
 
   Collective across MPI processes.
 
   @param[in,out]  mat   `MATCEED`
-  @param[in]      type  Inner `MatType`
+  @param[in]      type  COO `MatType`
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedGetInnerMatType(Mat mat, MatType *type) {
+PetscErrorCode MatCeedGetCOOMatType(Mat mat, MatType *type) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat, &ctx));
-  *type = ctx->internal_mat_type;
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-/**
-  @brief Set a user defined matrix operation for a `MATCEED` matrix.
-
-  Within each user-defined routine, the user should call `MatCeedGetContext()` to obtain the user-defined context that was set by
-`MatCeedSetContext()`.
-
-  Collective across MPI processes.
-
-  @param[in,out]  mat  `MATCEED`
-  @param[in]      op   Name of the `MatOperation`
-  @param[in]      g    Function that provides the operation
-
-  @return An error code: 0 - success, otherwise - failure
-**/
-PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)) {
-  PetscFunctionBeginUser;
-  PetscCall(MatShellSetOperation(mat, op, g));
+  *type = ctx->coo_mat_type;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -806,9 +909,7 @@ PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose) {
     PetscCall(VecGetSize(X_loc, &len_new));
     PetscCheck(len_old == len_new, PETSC_COMM_SELF, PETSC_ERR_LIB, "new X_loc length %" PetscInt_FMT " should match old X_loc length %" PetscInt_FMT,
                len_new, len_old);
-    PetscCall(VecDestroy(&ctx->X_loc));
-    ctx->X_loc = X_loc;
-    PetscCall(PetscObjectReference((PetscObject)X_loc));
+    PetscCall(VecReferenceCopy(X_loc, &ctx->X_loc));
   }
   if (Y_loc_transpose) {
     PetscInt len_old, len_new;
@@ -817,9 +918,7 @@ PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose) {
     PetscCall(VecGetSize(Y_loc_transpose, &len_new));
     PetscCheck(len_old == len_new, PETSC_COMM_SELF, PETSC_ERR_LIB,
                "new Y_loc_transpose length %" PetscInt_FMT " should match old Y_loc_transpose length %" PetscInt_FMT, len_new, len_old);
-    PetscCall(VecDestroy(&ctx->Y_loc_transpose));
-    ctx->Y_loc_transpose = Y_loc_transpose;
-    PetscCall(PetscObjectReference((PetscObject)Y_loc_transpose));
+    PetscCall(VecReferenceCopy(Y_loc_transpose, &ctx->Y_loc_transpose));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -841,12 +940,12 @@ PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose)
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat, &ctx));
   if (X_loc) {
-    *X_loc = ctx->X_loc;
-    PetscCall(PetscObjectReference((PetscObject)*X_loc));
+    *X_loc = NULL;
+    PetscCall(VecReferenceCopy(ctx->X_loc, X_loc));
   }
   if (Y_loc_transpose) {
-    *Y_loc_transpose = ctx->Y_loc_transpose;
-    PetscCall(PetscObjectReference((PetscObject)*Y_loc_transpose));
+    *Y_loc_transpose = NULL;
+    PetscCall(VecReferenceCopy(ctx->Y_loc_transpose, Y_loc_transpose));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -995,14 +1094,10 @@ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_trans
   (*ctx)->log_event_mult_transpose = log_event_mult_transpose;
 
   // PETSc objects
-  PetscCall(PetscObjectReference((PetscObject)dm_x));
-  (*ctx)->dm_x = dm_x;
-  PetscCall(PetscObjectReference((PetscObject)dm_y));
-  (*ctx)->dm_y = dm_y;
-  if (X_loc) PetscCall(PetscObjectReference((PetscObject)X_loc));
-  (*ctx)->X_loc = X_loc;
-  if (Y_loc_transpose) PetscCall(PetscObjectReference((PetscObject)Y_loc_transpose));
-  (*ctx)->Y_loc_transpose = Y_loc_transpose;
+  PetscCall(DMReferenceCopy(dm_x, &(*ctx)->dm_x));
+  PetscCall(DMReferenceCopy(dm_y, &(*ctx)->dm_y));
+  if (X_loc) PetscCall(VecReferenceCopy(X_loc, &(*ctx)->X_loc));
+  if (Y_loc_transpose) PetscCall(VecReferenceCopy(Y_loc_transpose, &(*ctx)->Y_loc_transpose));
 
   // Memtype
   {
@@ -1130,7 +1225,7 @@ PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx) {
   PetscCall(VecDestroy(&ctx->Y_loc_transpose));
   PetscCall(MatDestroy(&ctx->mat_assembled_full_internal));
   PetscCall(MatDestroy(&ctx->mat_assembled_pbd_internal));
-  PetscCall(PetscFree(ctx->internal_mat_type));
+  PetscCall(PetscFree(ctx->coo_mat_type));
   PetscCall(PetscFree(ctx->mats_assembled_full));
   PetscCall(PetscFree(ctx->mats_assembled_pbd));
 
diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index 99f88d0999..e51c21366b 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -312,7 +312,7 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool
   MatType   mat_ceed_inner_type;
 
   PetscFunctionBeginUser;
-  PetscCall(MatCeedGetInnerMatType(mat_ceed, &mat_ceed_inner_type));
+  PetscCall(MatCeedGetCOOMatType(mat_ceed, &mat_ceed_inner_type));
   {  // Determine if Amat should be MATCEED or assembled
     const char *ksp_prefix = NULL;
 

From a94b429f24f002e973340223a70623592de18e41 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 2 May 2024 14:16:48 -0600
Subject: [PATCH 019/571] fluids: Use new MatCeed/Ratel updates

---
 examples/fluids/navierstokes.h     |  1 -
 examples/fluids/src/petsc_ops.c    |  6 ++----
 examples/fluids/src/setuplibceed.c |  1 -
 examples/fluids/src/setupts.c      | 10 +++-------
 4 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 21732d6c83..e13cc5d0ea 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -276,7 +276,6 @@ struct Physics_private {
   CeedContextFieldLabel stg_solution_time_label;
   CeedContextFieldLabel timestep_size_label;
   CeedContextFieldLabel ics_time_label;
-  CeedContextFieldLabel ijacobian_time_shift_label;
 };
 
 typedef struct {
diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index e51c21366b..b74c20a73f 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -309,10 +309,8 @@ PetscErrorCode ApplyAddCeedOperatorLocalToLocal(Vec X_loc, Vec Y_loc, OperatorAp
  */
 PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool assemble, Mat *Amat, Mat *Pmat) {
   PetscBool use_matceed_pmat, assemble_amat = PETSC_FALSE;
-  MatType   mat_ceed_inner_type;
 
   PetscFunctionBeginUser;
-  PetscCall(MatCeedGetCOOMatType(mat_ceed, &mat_ceed_inner_type));
   {  // Determine if Amat should be MATCEED or assembled
     const char *ksp_prefix = NULL;
 
@@ -323,7 +321,7 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool
   }
 
   if (assemble_amat) {
-    PetscCall(MatConvert(mat_ceed, mat_ceed_inner_type, MAT_INITIAL_MATRIX, Amat));
+    PetscCall(MatCeedCreateMatCOO(mat_ceed, Amat));
     if (assemble) PetscCall(MatCeedAssembleCOO(mat_ceed, *Amat));
 
     PetscCall(PetscObjectReference((PetscObject)*Amat));
@@ -347,7 +345,7 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool
     PetscCall(PetscObjectReference((PetscObject)mat_ceed));
     *Pmat = mat_ceed;
   } else {
-    PetscCall(MatConvert(mat_ceed, mat_ceed_inner_type, MAT_INITIAL_MATRIX, Pmat));
+    PetscCall(MatCeedCreateMatCOO(mat_ceed, Pmat));
     if (assemble) PetscCall(MatCeedAssembleCOO(mat_ceed, *Pmat));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 972fffa459..f249bb6878 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -468,7 +468,6 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     if (op_ijacobian) {
       PetscCall(MatCeedCreate(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian));
       PetscCall(MatCeedSetLocalVectors(user->mat_ijacobian, user->Q_dot_loc, NULL));
-      PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ijacobian, "ijacobian time shift", &user->phys->ijacobian_time_shift_label));
       PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian));
     }
     if (app_ctx->sgs_model_type == SGS_MODEL_DATA_DRIVEN) PetscCall(SgsDDSetup(ceed, user, ceed_data, problem));
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 2ef3de7203..6fc7054240 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -154,8 +154,8 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u
 }
 
 PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal shift, Mat J, Mat J_pre, void *user_data) {
-  User      user = *(User *)user_data;
-  Ceed      ceed = user->ceed;
+  User      user         = *(User *)user_data;
+  double    shift_double = shift;
   PetscBool J_is_matceed, J_is_mffd, J_pre_is_matceed, J_pre_is_mffd;
 
   PetscFunctionBeginUser;
@@ -163,12 +163,8 @@ PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal
   PetscCall(PetscObjectTypeCompare((PetscObject)J, MATCEED, &J_is_matceed));
   PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATMFFD, &J_pre_is_mffd));
   PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATCEED, &J_pre_is_matceed));
-  if (user->phys->ijacobian_time_shift_label) {
-    CeedOperator op_ijacobian;
 
-    PetscCall(MatCeedGetCeedOperators(user->mat_ijacobian, &op_ijacobian, NULL));
-    PetscCallCeed(ceed, CeedOperatorSetContextDouble(op_ijacobian, user->phys->ijacobian_time_shift_label, &shift));
-  }
+  PetscCall(MatCeedSetContextDouble(user->mat_ijacobian, "ijacobian time shift", shift_double));
 
   if (J_is_matceed || J_is_mffd) {
     PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY));

From f965f5c6f80d9786386ec853fb0e2bfd6c15b6b7 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 2 May 2024 14:36:17 -0600
Subject: [PATCH 020/571] fluids: Demo MatCeed{Get,Set}ContextReal

---
 examples/fluids/include/mat-ceed.h |  2 ++
 examples/fluids/src/mat-ceed.c     | 39 ++++++++++++++++++++++++++++++
 examples/fluids/src/setupts.c      |  5 ++--
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h
index 8b05001273..17291be46f 100644
--- a/examples/fluids/include/mat-ceed.h
+++ b/examples/fluids/include/mat-ceed.h
@@ -26,6 +26,8 @@ PETSC_CEED_INTERN PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *na
 // Advanced functionality
 PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value);
 
 PETSC_CEED_EXTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void));
 PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type);
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index f08c71d1a3..06987153d4 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -709,6 +709,25 @@ PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value)
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Set the current `PetscReal` value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat    `MatCEED`
+  @param[in]      name   Name of the context field
+  @param[in]      value  New context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value) {
+  double value_double = value;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatCeedSetContextDouble(mat, name, value_double));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Get the current value of a context field for a `MatCEED`.
 
@@ -746,6 +765,26 @@ PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value)
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Get the current `PetscReal` value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in]   mat    `MatCEED`
+  @param[in]   name   Name of the context field
+  @param[out]  value  Current context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value) {
+  double value_double;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatCeedGetContextDouble(mat, name, &value_double));
+  *value = value_double;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Set user context for a `MATCEED`.
 
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 6fc7054240..5549c579f2 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -154,8 +154,7 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u
 }
 
 PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal shift, Mat J, Mat J_pre, void *user_data) {
-  User      user         = *(User *)user_data;
-  double    shift_double = shift;
+  User      user = *(User *)user_data;
   PetscBool J_is_matceed, J_is_mffd, J_pre_is_matceed, J_pre_is_mffd;
 
   PetscFunctionBeginUser;
@@ -164,7 +163,7 @@ PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal
   PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATMFFD, &J_pre_is_mffd));
   PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATCEED, &J_pre_is_matceed));
 
-  PetscCall(MatCeedSetContextDouble(user->mat_ijacobian, "ijacobian time shift", shift_double));
+  PetscCall(MatCeedSetContextReal(user->mat_ijacobian, "ijacobian time shift", shift));
 
   if (J_is_matceed || J_is_mffd) {
     PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY));

From b32e1e4ae534b35db4fce1af36ede8e6afa2b585 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 1 May 2024 08:07:16 -0600
Subject: [PATCH 021/571] fix(fluids): Add PCNONE to MatCeed solver selection

---
 examples/fluids/src/petsc_ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index b74c20a73f..398796f33b 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -338,7 +338,7 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool
 
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCGetType(pc, &pc_type));
-    PetscCall(PetscStrcmpAny(pc_type, &use_matceed_pmat, PCJACOBI, PCVPBJACOBI, PCPBJACOBI, ""));
+    PetscCall(PetscStrcmpAny(pc_type, &use_matceed_pmat, PCNONE, PCJACOBI, PCVPBJACOBI, PCPBJACOBI, ""));
   }
 
   if (use_matceed_pmat) {

From 24078fc496a1fe46181eedb6cbe4bfe7f461c091 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 1 May 2024 08:08:32 -0600
Subject: [PATCH 022/571] fix(fluids): Print info just before solving

---
 examples/fluids/navierstokes.c | 5 +----
 examples/fluids/navierstokes.h | 2 +-
 examples/fluids/src/setupts.c  | 4 +++-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 919ff16669..6f424034b7 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -229,9 +229,6 @@ int main(int argc, char **argv) {
     PetscCall(SetupICsFromBinary(comm, app_ctx, Q));
   }
 
-  // Print problem summary
-  if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, phys_ctx, problem, comm));
-
   // -- Zero Q_loc
   PetscCall(VecZeroEntries(user->Q_loc));
 
@@ -240,7 +237,7 @@ int main(int argc, char **argv) {
   // ---------------------------------------------------------------------------
   TS          ts;
   PetscScalar final_time;
-  PetscCall(TSSolve_NS(dm, user, app_ctx, phys_ctx, &Q, &final_time, &ts));
+  PetscCall(TSSolve_NS(dm, user, app_ctx, phys_ctx, problem, &Q, &final_time, &ts));
 
   // ---------------------------------------------------------------------------
   // Post-processing
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index e13cc5d0ea..b0b73260f5 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -366,7 +366,7 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u
 PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void *ctx);
 
 // TS: Create, setup, and solve
-PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts);
+PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, ProblemData problem, Vec *Q, PetscScalar *f_time, TS *ts);
 
 // Update Boundary Values when time has changed
 PetscErrorCode UpdateBoundaryValues(User user, Vec Q_loc, PetscReal t);
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 5549c579f2..b91344f655 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -298,7 +298,7 @@ PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void
 }
 
 // TS: Create, setup, and solve
-PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts) {
+PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, ProblemData problem, Vec *Q, PetscScalar *f_time, TS *ts) {
   MPI_Comm    comm = user->comm;
   TSAdapt     adapt;
   PetscScalar final_time;
@@ -377,6 +377,8 @@ PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q
     PetscCall(TSMonitorSet(*ts, TSMonitor_SGS_DD_Training, user, NULL));
     PetscCall(TSSetPostStep(*ts, TSPostStep_SGS_DD_Training));
   }
+
+  if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, user->phys, problem, comm));
   // Solve
   PetscReal start_time;
   PetscInt  start_step;

From 43327b86957a0a83ddc40b554a65a2ea63a53aee Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 3 May 2024 13:35:15 -0600
Subject: [PATCH 023/571] fluids: Add back CeedOperator PetscLogEvents

Remove the unused fluids-based ones in favor of the built-in MatCeed
ones
---
 examples/fluids/include/mat-ceed-impl.h |   5 +-
 examples/fluids/include/mat-ceed.h      |   2 +
 examples/fluids/navierstokes.h          |   3 -
 examples/fluids/src/mat-ceed.c          | 137 +++++++++++++++++++-----
 examples/fluids/src/misc.c              |   6 --
 5 files changed, 115 insertions(+), 38 deletions(-)

diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h
index cb49fe901e..cfc1bd61f6 100644
--- a/examples/fluids/include/mat-ceed-impl.h
+++ b/examples/fluids/include/mat-ceed-impl.h
@@ -20,7 +20,7 @@ struct MatCeedContext_private {
   PetscMemType   mem_type;
   PetscInt       ref_count, num_mats_assembled_full, num_mats_assembled_pbd;
   PetscBool      is_destroyed, is_ceed_pbd_valid, is_ceed_vpbd_valid;
-  PetscLogEvent  log_event_mult, log_event_mult_transpose;
+  PetscLogEvent  log_event_mult, log_event_mult_transpose, log_event_ceed_mult, log_event_ceed_mult_transpose;
   DM             dm_x, dm_y;
   Mat           *mats_assembled_full, *mats_assembled_pbd, mat_assembled_full_internal, mat_assembled_pbd_internal;
   Vec            X_loc, Y_loc_transpose;
@@ -32,7 +32,8 @@ struct MatCeedContext_private {
 // Context data
 PETSC_CEED_EXTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult,
                                                       CeedOperator op_mult_transpose, PetscLogEvent log_event_mult,
-                                                      PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx);
+                                                      PetscLogEvent log_event_mult_transpose, PetscLogEvent log_event_ceed_mult,
+                                                      PetscLogEvent log_event_ceed_mult_transpose, MatCeedContext *ctx);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx);
diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h
index 17291be46f..42a192fa12 100644
--- a/examples/fluids/include/mat-ceed.h
+++ b/examples/fluids/include/mat-ceed.h
@@ -42,3 +42,5 @@ PETSC_CEED_EXTERN PetscErrorCode MatCeedRestoreCeedOperators(Mat mat, CeedOperat
 
 PETSC_CEED_EXTERN PetscErrorCode MatCeedSetLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCeedOperatorLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCeedOperatorLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose);
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index e13cc5d0ea..f27c410bd1 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -80,9 +80,6 @@ static const char *const DifferentialFilterDampingFunctions[] = {
 // Log Events
 // -----------------------------------------------------------------------------
 extern PetscLogEvent FLUIDS_CeedOperatorApply;
-extern PetscLogEvent FLUIDS_CeedOperatorAssemble;
-extern PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal;
-extern PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
 extern PetscLogEvent FLUIDS_SmartRedis_Init;
 extern PetscLogEvent FLUIDS_SmartRedis_Meta;
 extern PetscLogEvent FLUIDS_SmartRedis_Train;
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index 06987153d4..cd164598ed 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -12,7 +12,9 @@
 #include <string.h>
 
 PetscClassId  MATCEED_CLASSID;
-PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE;
+PetscLogEvent MATCEED_MULT, MATCEED_MULT_CEEDOP, MATCEED_MULT_TRANSPOSE, MATCEED_MULT_TRANSPOSE_CEEDOP, MATCEED_ASSEMBLE_DIAGONAL,
+    MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, MATCEED_SETUP_PBDIAGONAL, MATCEED_SETUP_PBDIAGONAL_CEEDOP, MATCEED_ASSEMBLE_PBDIAGONAL,
+    MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, MATCEED_SETUP_FULL, MATCEED_SETUP_FULL_CEEDOP, MATCEED_ASSEMBLE_FULL, MATCEED_ASSEMBLE_FULL_CEEDOP;
 
 /**
   @brief Register MATCEED log events.
@@ -26,9 +28,20 @@ static PetscErrorCode MatCeedRegisterLogEvents() {
 
   PetscFunctionBeginUser;
   if (registered) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(PetscClassIdRegister("MATCEED", &MATCEED_CLASSID));
-  PetscCall(PetscLogEventRegister("MATCEED Mult", MATCEED_CLASSID, &MATCEED_MULT));
-  PetscCall(PetscLogEventRegister("MATCEED Mult Transpose", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE));
+  PetscCall(PetscClassIdRegister("MatCEED", &MATCEED_CLASSID));
+  PetscCall(PetscLogEventRegister("MatCEEDMul", MATCEED_CLASSID, &MATCEED_MULT));
+  PetscCall(PetscLogEventRegister("MatCEEDMulCeed", MATCEED_CLASSID, &MATCEED_MULT_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDMulT", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE));
+  PetscCall(PetscLogEventRegister("MatCEEDMulTCeed", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmDiag", MATCEED_CLASSID, &MATCEED_ASSEMBLE_DIAGONAL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBDSU", MATCEED_CLASSID, &MATCEED_SETUP_PBDIAGONAL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBDSUCeed", MATCEED_CLASSID, &MATCEED_SETUP_PBDIAGONAL_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBD", MATCEED_CLASSID, &MATCEED_ASSEMBLE_PBDIAGONAL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBDCeed", MATCEED_CLASSID, &MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmSU", MATCEED_CLASSID, &MATCEED_SETUP_FULL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmSUCeed", MATCEED_CLASSID, &MATCEED_SETUP_FULL_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsm", MATCEED_CLASSID, &MATCEED_ASSEMBLE_FULL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmCeed", MATCEED_CLASSID, &MATCEED_ASSEMBLE_FULL_CEEDOP));
   registered = PETSC_TRUE;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -65,12 +78,15 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
       PetscLogStage stage_amg_setup;
 
       // -- Assemble sparsity pattern if mat hasn't been assembled before
-      PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup));
+      PetscCall(PetscLogStageGetId("MatCEED Asm Setup", &stage_amg_setup));
       if (stage_amg_setup == -1) {
-        PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup));
+        PetscCall(PetscLogStageRegister("MatCEED Asm Setup", &stage_amg_setup));
       }
       PetscCall(PetscLogStagePush(stage_amg_setup));
+      PetscCall(PetscLogEventBegin(MATCEED_SETUP_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
+      PetscCall(PetscLogEventBegin(MATCEED_SETUP_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
       PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed));
+      PetscCall(PetscLogEventEnd(MATCEED_SETUP_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
       PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc));
       PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc));
       PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc));
@@ -79,11 +95,13 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
       if (!ctx->coo_values_pbd) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_pbd));
       PetscCall(PetscRealloc(++ctx->num_mats_assembled_pbd * sizeof(Mat), &ctx->mats_assembled_pbd));
       ctx->mats_assembled_pbd[ctx->num_mats_assembled_pbd - 1] = mat_coo;
+      PetscCall(PetscLogEventEnd(MATCEED_SETUP_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
       PetscCall(PetscLogStagePop());
     }
   }
 
   // Assemble mat_ceed
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
   PetscCall(MatAssemblyBegin(mat_coo, MAT_FINAL_ASSEMBLY));
   {
     const CeedScalar *values;
@@ -96,7 +114,9 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
     else if (strstr(mat_type, "kokkos")) mem_type = CEED_MEM_DEVICE;
     else mem_type = CEED_MEM_HOST;
 
+    PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemblePointBlockDiagonal(ctx->op_mult, ctx->coo_values_pbd, CEED_REQUEST_IMMEDIATE));
+    PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedVectorGetArrayRead(ctx->coo_values_pbd, mem_type, &values));
     PetscCall(MatSetValuesCOO(mat_coo, values, INSERT_VALUES));
     PetscCall(MatIsSPDKnown(mat_ceed, &is_spd_known, &is_spd));
@@ -104,6 +124,7 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
     PetscCallCeed(ctx->ceed, CeedVectorRestoreArrayRead(ctx->coo_values_pbd, &values));
   }
   PetscCall(MatAssemblyEnd(mat_coo, MAT_FINAL_ASSEMBLY));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -317,7 +338,8 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
       PetscCall(DMCreateLocalVector(dm_y, &Y_loc_transpose));
       PetscCall(VecZeroEntries(Y_loc_transpose));
     }
-    PetscCall(MatCeedContextCreate(dm_x, dm_y, X_loc, Y_loc_transpose, op_mult, op_mult_transpose, MATCEED_MULT, MATCEED_MULT_TRANSPOSE, &ctx));
+    PetscCall(MatCeedContextCreate(dm_x, dm_y, X_loc, Y_loc_transpose, op_mult, op_mult_transpose, MATCEED_MULT, MATCEED_MULT_TRANSPOSE,
+                                   MATCEED_MULT_CEEDOP, MATCEED_MULT_TRANSPOSE_CEEDOP, &ctx));
     PetscCall(VecDestroy(&X_loc));
     PetscCall(VecDestroy(&Y_loc_transpose));
   }
@@ -597,12 +619,15 @@ PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo) {
     PetscLogStage stage_amg_setup;
 
     // -- Assemble sparsity pattern if mat hasn't been assembled before
-    PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup));
+    PetscCall(PetscLogStageGetId("MatCEED Asm Setup", &stage_amg_setup));
     if (stage_amg_setup == -1) {
-      PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup));
+      PetscCall(PetscLogStageRegister("MatCEED Asm Setup", &stage_amg_setup));
     }
     PetscCall(PetscLogStagePush(stage_amg_setup));
+    PetscCall(PetscLogEventBegin(MATCEED_SETUP_FULL, mat_ceed, mat_coo, NULL, NULL));
+    PetscCall(PetscLogEventBegin(MATCEED_SETUP_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed));
+    PetscCall(PetscLogEventEnd(MATCEED_SETUP_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc));
     PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc));
     PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc));
@@ -611,6 +636,7 @@ PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo) {
     if (!ctx->coo_values_full) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_full));
     PetscCall(PetscRealloc(++ctx->num_mats_assembled_full * sizeof(Mat), &ctx->mats_assembled_full));
     ctx->mats_assembled_full[ctx->num_mats_assembled_full - 1] = mat_coo;
+    PetscCall(PetscLogEventEnd(MATCEED_SETUP_FULL, mat_ceed, mat_coo, NULL, NULL));
     PetscCall(PetscLogStagePop());
   }
   PetscFunctionReturn(PETSC_SUCCESS);
@@ -645,6 +671,7 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
   }
 
   // Assemble mat_ceed
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_FULL, mat_ceed, mat_coo, NULL, NULL));
   PetscCall(MatAssemblyBegin(mat_coo, MAT_FINAL_ASSEMBLY));
   {
     const CeedScalar *values;
@@ -657,7 +684,9 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
     else if (strstr(mat_type, "kokkos")) mem_type = CEED_MEM_DEVICE;
     else mem_type = CEED_MEM_HOST;
 
+    PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemble(ctx->op_mult, ctx->coo_values_full));
+    PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedVectorGetArrayRead(ctx->coo_values_full, mem_type, &values));
     PetscCall(MatSetValuesCOO(mat_coo, values, INSERT_VALUES));
     PetscCall(MatIsSPDKnown(mat_ceed, &is_spd_known, &is_spd));
@@ -665,6 +694,7 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
     PetscCallCeed(ctx->ceed, CeedVectorRestoreArrayRead(ctx->coo_values_full, &values));
   }
   PetscCall(MatAssemblyEnd(mat_coo, MAT_FINAL_ASSEMBLY));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_FULL, mat_ceed, mat_coo, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -1097,6 +1127,48 @@ PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, Petsc
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Set `CeedOperator` `PetscLogEvent` for `MATCEED` `MatMult()` and `MatMultTranspose()` operators.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat                       MatCeed
+  @param[out]     log_event_mult            `PetscLogEvent` for forward `CeedOperator` evaluation, or NULL
+  @param[out]     log_event_mult_transpose  `PetscLogEvent` for transpose `CeedOperator` evaluation, or NULL
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetCeedOperatorLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  if (log_event_mult) ctx->log_event_ceed_mult = log_event_mult;
+  if (log_event_mult_transpose) ctx->log_event_ceed_mult_transpose = log_event_mult_transpose;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Get `CeedOperator` `PetscLogEvent` for `MATCEED` `MatMult()` and `MatMultTranspose()` operators.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat                       MatCeed
+  @param[out]     log_event_mult            `PetscLogEvent` for forward `CeedOperator` evaluation, or NULL
+  @param[out]     log_event_mult_transpose  `PetscLogEvent` for transpose `CeedOperator` evaluation, or NULL
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetCeedOperatorLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  if (log_event_mult) *log_event_mult = ctx->log_event_ceed_mult;
+  if (log_event_mult_transpose) *log_event_mult_transpose = ctx->log_event_ceed_mult_transpose;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 // -----------------------------------------------------------------------------
 // Operator context data
 // -----------------------------------------------------------------------------
@@ -1106,20 +1178,23 @@ PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, Petsc
 
   Collective across MPI processes.
 
-  @param[in]   dm_x                      Input `DM`
-  @param[in]   dm_y                      Output `DM`
-  @param[in]   X_loc                     Input PETSc local vector, or NULL
-  @param[in]   Y_loc_transpose           Input PETSc local vector for transpose operation, or NULL
-  @param[in]   op_mult                   `CeedOperator` for forward evaluation
-  @param[in]   op_mult_transpose         `CeedOperator` for transpose evaluation
-  @param[in]   log_event_mult            `PetscLogEvent` for forward evaluation
-  @param[in]   log_event_mult_transpose  `PetscLogEvent` for transpose evaluation
-  @param[out]  ctx                       Context data for operator evaluation
+  @param[in]   dm_x                           Input `DM`
+  @param[in]   dm_y                           Output `DM`
+  @param[in]   X_loc                          Input PETSc local vector, or NULL
+  @param[in]   Y_loc_transpose                Input PETSc local vector for transpose operation, or NULL
+  @param[in]   op_mult                        `CeedOperator` for forward evaluation
+  @param[in]   op_mult_transpose              `CeedOperator` for transpose evaluation
+  @param[in]   log_event_mult                 `PetscLogEvent` for forward evaluation
+  @param[in]   log_event_mult_transpose       `PetscLogEvent` for transpose evaluation
+  @param[in]   log_event_ceed_mult            `PetscLogEvent` for forward `CeedOperator` evaluation
+  @param[in]   log_event_ceed_mult_transpose  `PetscLogEvent` for transpose `CeedOperator` evaluation
+  @param[out]  ctx                            Context data for operator evaluation
 
   @return An error code: 0 - success, otherwise - failure
 **/
 PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult, CeedOperator op_mult_transpose,
-                                    PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx) {
+                                    PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose, PetscLogEvent log_event_ceed_mult,
+                                    PetscLogEvent log_event_ceed_mult_transpose, MatCeedContext *ctx) {
   CeedSize x_loc_len, y_loc_len;
 
   PetscFunctionBeginUser;
@@ -1129,8 +1204,10 @@ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_trans
   (*ctx)->ref_count = 1;
 
   // Logging
-  (*ctx)->log_event_mult           = log_event_mult;
-  (*ctx)->log_event_mult_transpose = log_event_mult_transpose;
+  (*ctx)->log_event_mult                = log_event_mult;
+  (*ctx)->log_event_mult_transpose      = log_event_mult_transpose;
+  (*ctx)->log_event_ceed_mult           = log_event_ceed_mult;
+  (*ctx)->log_event_ceed_mult_transpose = log_event_ceed_mult_transpose;
 
   // PETSc objects
   PetscCall(DMReferenceCopy(dm_x, &(*ctx)->dm_x));
@@ -1302,11 +1379,14 @@ PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D) {
   PetscCall(MatShellGetContext(A, &ctx));
 
   // Place PETSc vector in libCEED vector
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_DIAGONAL, A, D, NULL, NULL));
   PetscCall(DMGetLocalVector(ctx->dm_x, &D_loc));
   PetscCall(VecPetscToCeed(D_loc, &mem_type, ctx->x_loc));
 
   // Compute Diagonal
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, A, D, NULL, NULL));
   PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleDiagonal(ctx->op_mult, ctx->x_loc, CEED_REQUEST_IMMEDIATE));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, A, D, NULL, NULL));
 
   // Restore PETSc vector
   PetscCall(VecCeedToPetsc(ctx->x_loc, mem_type, D_loc));
@@ -1315,6 +1395,7 @@ PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D) {
   PetscCall(VecZeroEntries(D));
   PetscCall(DMLocalToGlobal(ctx->dm_x, D_loc, ADD_VALUES, D));
   PetscCall(DMRestoreLocalVector(ctx->dm_x, &D_loc));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_DIAGONAL, A, D, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -1334,7 +1415,7 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) {
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(A, &ctx));
-  PetscCall(PetscLogEventBegin(ctx->log_event_mult, A, X, Y, 0));
+  PetscCall(PetscLogEventBegin(ctx->log_event_mult, A, X, Y, NULL));
 
   {
     PetscMemType x_mem_type, y_mem_type;
@@ -1354,7 +1435,9 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) {
 
     // Apply libCEED operator
     PetscCall(PetscLogGpuTimeBegin());
+    PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult, A, X, Y, NULL));
     PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult, ctx->x_loc, ctx->y_loc, CEED_REQUEST_IMMEDIATE));
+    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult, A, X, Y, NULL));
     PetscCall(PetscLogGpuTimeEnd());
 
     // Restore PETSc vectors
@@ -1373,8 +1456,7 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) {
   // Log flops
   if (PetscMemTypeDevice(ctx->mem_type)) PetscCall(PetscLogGpuFlops(ctx->flops_mult));
   else PetscCall(PetscLogFlops(ctx->flops_mult));
-
-  PetscCall(PetscLogEventEnd(ctx->log_event_mult, A, X, Y, 0));
+  PetscCall(PetscLogEventEnd(ctx->log_event_mult, A, X, Y, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -1394,7 +1476,7 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) {
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(A, &ctx));
-  PetscCall(PetscLogEventBegin(ctx->log_event_mult_transpose, A, Y, X, 0));
+  PetscCall(PetscLogEventBegin(ctx->log_event_mult_transpose, A, Y, X, NULL));
 
   {
     PetscMemType x_mem_type, y_mem_type;
@@ -1414,7 +1496,9 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) {
 
     // Apply libCEED operator
     PetscCall(PetscLogGpuTimeBegin());
+    PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL));
     PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult_transpose, ctx->y_loc, ctx->x_loc, CEED_REQUEST_IMMEDIATE));
+    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL));
     PetscCall(PetscLogGpuTimeEnd());
 
     // Restore PETSc vectors
@@ -1433,7 +1517,6 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) {
   // Log flops
   if (PetscMemTypeDevice(ctx->mem_type)) PetscCall(PetscLogGpuFlops(ctx->flops_mult_transpose));
   else PetscCall(PetscLogFlops(ctx->flops_mult_transpose));
-
-  PetscCall(PetscLogEventEnd(ctx->log_event_mult_transpose, A, Y, X, 0));
+  PetscCall(PetscLogEventEnd(ctx->log_event_mult_transpose, A, Y, X, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index e73769741c..5bbdd98379 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -365,9 +365,6 @@ PetscErrorCode PhastaDatFileReadToArrayReal(MPI_Comm comm, const char path[PETSC
 }
 
 PetscLogEvent       FLUIDS_CeedOperatorApply;
-PetscLogEvent       FLUIDS_CeedOperatorAssemble;
-PetscLogEvent       FLUIDS_CeedOperatorAssembleDiagonal;
-PetscLogEvent       FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
 PetscLogEvent       FLUIDS_SmartRedis_Init;
 PetscLogEvent       FLUIDS_SmartRedis_Meta;
 PetscLogEvent       FLUIDS_SmartRedis_Train;
@@ -380,9 +377,6 @@ PetscErrorCode RegisterLogEvents() {
   PetscFunctionBeginUser;
   PetscCall(PetscClassIdRegister("libCEED", &libCEED_classid));
   PetscCall(PetscLogEventRegister("CeedOpApply", libCEED_classid, &FLUIDS_CeedOperatorApply));
-  PetscCall(PetscLogEventRegister("CeedOpAsm", libCEED_classid, &FLUIDS_CeedOperatorAssemble));
-  PetscCall(PetscLogEventRegister("CeedOpAsmD", libCEED_classid, &FLUIDS_CeedOperatorAssembleDiagonal));
-  PetscCall(PetscLogEventRegister("CeedOpAsmPBD", libCEED_classid, &FLUIDS_CeedOperatorAssemblePointBlockDiagonal));
 
   PetscCall(PetscClassIdRegister("onlineTrain", &onlineTrain_classid));
   PetscCall(PetscLogEventRegister("SmartRedis_Init", onlineTrain_classid, &FLUIDS_SmartRedis_Init));

From 7bc7b61f650775c1f4a19293c33952b8958158d6 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 1 May 2024 11:50:04 -0600
Subject: [PATCH 024/571] fluids: Print correct Mat Types

---
 examples/fluids/navierstokes.h |  2 +-
 examples/fluids/src/misc.c     | 43 ++++++++++++++++++++++++++--------
 examples/fluids/src/setupts.c  |  2 +-
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index b0b73260f5..768ff84921 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -325,7 +325,7 @@ extern PetscErrorCode PRINT_ADVECTION(User user, ProblemData problem, AppCtx app
 
 extern PetscErrorCode PRINT_ADVECTION2D(User user, ProblemData problem, AppCtx app_ctx);
 
-PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MPI_Comm comm);
+PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS ts);
 
 // -----------------------------------------------------------------------------
 // libCEED functions
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index e73769741c..8fe1484b67 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -397,8 +397,10 @@ PetscErrorCode RegisterLogEvents() {
 }
 
 // Print information about the given simulation run
-PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MPI_Comm comm) {
-  Ceed ceed = user->ceed;
+PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS ts) {
+  Ceed     ceed = user->ceed;
+  MPI_Comm comm = PetscObjectComm((PetscObject)ts);
+
   PetscFunctionBeginUser;
   // Header and rank
   char        host_name[PETSC_MAX_PATH_LEN];
@@ -427,22 +429,43 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP
                         "    libCEED Backend MemType            : %s\n",
                         used_resource, CeedMemTypes[mem_type_backend]));
   // PETSc
-  char box_faces_str[PETSC_MAX_PATH_LEN] = "3,3,3";
+  VecType vec_type;
+  char    box_faces_str[PETSC_MAX_PATH_LEN] = "3,3,3";
   if (problem->dim == 2) box_faces_str[3] = '\0';
   PetscCall(PetscOptionsGetString(NULL, NULL, "-dm_plex_box_faces", box_faces_str, sizeof(box_faces_str), NULL));
-  MatType amat_type = user->app_ctx->amat_type, pmat_type;
-  VecType vec_type;
-  PetscCall(DMGetMatType(user->dm, &pmat_type));
-  if (!amat_type) amat_type = pmat_type;
   PetscCall(DMGetVecType(user->dm, &vec_type));
   PetscCall(PetscPrintf(comm,
                         "  PETSc:\n"
                         "    Box Faces                          : %s\n"
-                        "    A MatType                          : %s\n"
-                        "    P MatType                          : %s\n"
                         "    DM VecType                         : %s\n"
                         "    Time Stepping Scheme               : %s\n",
-                        box_faces_str, amat_type, pmat_type, vec_type, phys_ctx->implicit ? "implicit" : "explicit"));
+                        box_faces_str, vec_type, phys_ctx->implicit ? "implicit" : "explicit"));
+  {
+    char           pmat_type_str[PETSC_MAX_PATH_LEN];
+    MatType        amat_type, pmat_type;
+    Mat            Amat, Pmat;
+    TSIJacobianFn *ijacob_function;
+
+    PetscCall(TSGetIJacobian(ts, &Amat, &Pmat, &ijacob_function, NULL));
+    PetscCall(MatGetType(Amat, &amat_type));
+    PetscCall(MatGetType(Pmat, &pmat_type));
+
+    PetscCall(PetscStrncpy(pmat_type_str, pmat_type, sizeof(pmat_type_str)));
+    if (!strcmp(pmat_type, MATCEED)) {
+      MatType pmat_coo_type;
+      char    pmat_coo_type_str[PETSC_MAX_PATH_LEN];
+
+      PetscCall(MatCeedGetCOOMatType(Pmat, &pmat_coo_type));
+      PetscCall(PetscSNPrintf(pmat_coo_type_str, sizeof(pmat_coo_type_str), " (COO MatType: %s)", pmat_coo_type));
+      PetscCall(PetscStrlcat(pmat_type_str, pmat_coo_type_str, sizeof(pmat_type_str)));
+    }
+    if (ijacob_function) {
+      PetscCall(PetscPrintf(comm,
+                            "    IJacobian A MatType                : %s\n"
+                            "    IJacobian P MatType                : %s\n",
+                            amat_type, pmat_type_str));
+    }
+  }
   if (user->app_ctx->cont_steps) {
     PetscCall(PetscPrintf(comm,
                           "  Continue:\n"
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index b91344f655..8ed11e10c0 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -378,7 +378,7 @@ PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Proble
     PetscCall(TSSetPostStep(*ts, TSPostStep_SGS_DD_Training));
   }
 
-  if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, user->phys, problem, comm));
+  if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, user->phys, problem, *ts));
   // Solve
   PetscReal start_time;
   PetscInt  start_step;

From 402a89e62454430262c5ecbbd4edfd8709c074dd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 7 May 2024 11:59:22 -0600
Subject: [PATCH 025/571] ci - use 6.1.0 in CI

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3ae8918f2a..b5ebd5c954 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -246,7 +246,7 @@ noether-rocm:
   script:
     - rm -f .SUCCESS
 # libCEED
-    - make configure ROCM_DIR=/opt/rocm-5.6.0 OPT='-O -march=native -ffp-contract=fast'
+    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
@@ -330,7 +330,7 @@ noether-float:
 # Change to single precision
     - sed -i 's/ceed-f64/ceed-f32/1' include/ceed/types.h
 # Build libCEED
-    - make configure ROCM_DIR=/opt/rocm-5.6.0 OPT='-O -march=native -ffp-contract=fast'
+    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU

From 196058356ce16dc55f8605a42694186898beb49b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 6 May 2024 12:03:07 -0600
Subject: [PATCH 026/571] rstr - support different at point e-layouts

---
 include/ceed-impl.h              |  1 +
 include/ceed/backend.h           |  1 +
 interface/ceed-elemrestriction.c | 39 ++++++++++++++++++++++++++++++++
 interface/ceed.c                 |  1 +
 tests/t231-elemrestriction.c     | 14 ++++++++----
 5 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 52b6beb633..4fe0c67c9f 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -157,6 +157,7 @@ struct CeedElemRestriction_private {
   int (*ApplyUnoriented)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
   int (*ApplyAtPointsInElement)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
   int (*ApplyBlock)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
+  int (*GetAtPointsElementOffset)(CeedElemRestriction, CeedInt, CeedSize *);
   int (*GetOffsets)(CeedElemRestriction, CeedMemType, const CeedInt **);
   int (*GetOrientations)(CeedElemRestriction, CeedMemType, const bool **);
   int (*GetCurlOrientations)(CeedElemRestriction, CeedMemType, const CeedInt8 **);
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 72868c1ff0..ee61d638e6 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -292,6 +292,7 @@ CEED_EXTERN int CeedElemRestrictionGetLLayout(CeedElemRestriction rstr, CeedInt
 CEED_EXTERN int CeedElemRestrictionSetLLayout(CeedElemRestriction rstr, CeedInt layout[3]);
 CEED_EXTERN int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, CeedInt layout[3]);
 CEED_EXTERN int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]);
+CEED_EXTERN int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset);
 CEED_EXTERN int CeedElemRestrictionGetData(CeedElemRestriction rstr, void *data);
 CEED_EXTERN int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data);
 CEED_EXTERN int CeedElemRestrictionReference(CeedElemRestriction rstr);
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index e687c0daed..8279e34d05 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -429,6 +429,45 @@ int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+
+  @brief Get the E-vector element offset of a `CeedElemRestriction` at points
+
+  @param[in]  rstr        `CeedElemRestriction`
+  @param[in]  elem        Element number index into E-vector for
+  @param[out] elem_offset Offset for element `elem` in the E-vector.
+                            The data for point `i`, component `j`, element `elem` in the E-vector is given by `i*e_layout[0] + j*e_layout[1] + elem_offset`.
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) {
+  CeedInt             num_comp;
+  CeedRestrictionType rstr_type;
+
+  CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
+  CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
+            "Can only compute offset for a points CeedElemRestriction");
+
+  // Backend method
+  if (rstr->GetAtPointsElementOffset) {
+    CeedCall(rstr->GetAtPointsElementOffset(rstr, elem, elem_offset));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Default layout (CPU)
+  *elem_offset = 0;
+  CeedCall(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
+  for (CeedInt i = 0; i < elem; i++) {
+    CeedInt num_points;
+
+    CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, i, &num_points));
+    *elem_offset += num_points * num_comp;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the backend data of a `CeedElemRestriction`
 
diff --git a/interface/ceed.c b/interface/ceed.c
index ad7f09fa8e..5800ee2bb4 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -948,6 +948,7 @@ int CeedInit(const char *resource, Ceed *ceed) {
       CEED_FTABLE_ENTRY(CeedElemRestriction, GetOffsets),
       CEED_FTABLE_ENTRY(CeedElemRestriction, GetOrientations),
       CEED_FTABLE_ENTRY(CeedElemRestriction, GetCurlOrientations),
+      CEED_FTABLE_ENTRY(CeedElemRestriction, GetAtPointsElementOffset),
       CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy),
       CEED_FTABLE_ENTRY(CeedBasis, Apply),
       CEED_FTABLE_ENTRY(CeedBasis, ApplyAtPoints),
diff --git a/tests/t231-elemrestriction.c b/tests/t231-elemrestriction.c
index de6cd2466e..abbf64e766 100644
--- a/tests/t231-elemrestriction.c
+++ b/tests/t231-elemrestriction.c
@@ -2,6 +2,7 @@
 /// Test creation, use, and destruction of an element restriction at points
 /// \test Test creation, use, and destruction of an element restriction at points
 #include <ceed.h>
+#include <ceed/backend.h>
 #include <stdio.h>
 
 int main(int argc, char **argv) {
@@ -49,21 +50,24 @@ int main(int argc, char **argv) {
   CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
   CeedElemRestrictionApply(elem_restriction, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
   {
-    CeedInt           index = 0;
+    CeedInt           e_layout[3];
     const CeedScalar *read_array;
 
     CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
+    CeedElemRestrictionGetELayout(elem_restriction, e_layout);
 
     for (CeedInt i = 0; i < num_elem; i++) {
-      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
+      CeedSize      elem_offset        = 0;
+      const CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
+      CeedElemRestrictionGetAtPointsElementOffset(elem_restriction, i, &elem_offset);
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        if (i != read_array[index]) {
+        if (i != read_array[elem_offset + j * e_layout[0]]) {
           // LCOV_EXCL_START
-          printf("Error in restricted array y[%" CeedInt_FMT "] = %f\n", index, (CeedScalar)read_array[i]);
+          printf("Error in restricted array y[%" CeedInt_FMT "] = %f\n != %f\n", (CeedInt)elem_offset + j * e_layout[0],
+                 (CeedScalar)read_array[elem_offset + j * e_layout[0]], (CeedScalar)i);
           // LCOV_EXCL_STOP
         }
-        index++;
       }
     }
     CeedVectorRestoreArrayRead(y, &read_array);

From 2c2c926cf994afa71f58703df713ffab2e2b6d6e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 7 May 2024 14:48:28 -0600
Subject: [PATCH 027/571] rstr - set comp stride for AtPoints rstr

---
 interface/ceed-elemrestriction.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 8279e34d05..7f778e9afe 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -853,15 +853,16 @@ int CeedElemRestrictionCreateAtPoints(Ceed ceed, CeedInt num_elem, CeedInt num_p
 
   CeedCall(CeedCalloc(1, rstr));
   CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count  = 1;
-  (*rstr)->num_elem   = num_elem;
-  (*rstr)->num_points = num_points;
-  (*rstr)->num_comp   = num_comp;
-  (*rstr)->l_size     = l_size;
-  (*rstr)->e_size     = (CeedSize)num_points * (CeedSize)num_comp;
-  (*rstr)->num_block  = num_elem;
-  (*rstr)->block_size = 1;
-  (*rstr)->rstr_type  = CEED_RESTRICTION_POINTS;
+  (*rstr)->ref_count   = 1;
+  (*rstr)->num_elem    = num_elem;
+  (*rstr)->num_points  = num_points;
+  (*rstr)->num_comp    = num_comp;
+  (*rstr)->comp_stride = 1;
+  (*rstr)->l_size      = l_size;
+  (*rstr)->e_size      = (CeedSize)num_points * (CeedSize)num_comp;
+  (*rstr)->num_block   = num_elem;
+  (*rstr)->block_size  = 1;
+  (*rstr)->rstr_type   = CEED_RESTRICTION_POINTS;
   CeedCall(ceed->ElemRestrictionCreateAtPoints(mem_type, copy_mode, offsets, NULL, NULL, *rstr));
   return CEED_ERROR_SUCCESS;
 }

From f930fbbf1bfb835619d7d941b989609757eb4f7c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 8 May 2024 09:55:29 -0600
Subject: [PATCH 028/571] test - let rstr AtPoints create vectors

---
 tests/t231-elemrestriction.c | 30 +++++++++++++++---------------
 tests/t232-elemrestriction.c | 28 ++++++++++++++--------------
 tests/t233-elemrestriction.c |  5 ++---
 3 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/tests/t231-elemrestriction.c b/tests/t231-elemrestriction.c
index abbf64e766..b949044143 100644
--- a/tests/t231-elemrestriction.c
+++ b/tests/t231-elemrestriction.c
@@ -14,40 +14,40 @@ int main(int argc, char **argv) {
 
   CeedInit(argv[1], &ceed);
 
-  CeedVectorCreate(ceed, num_points, &x);
   {
-    CeedInt    point_index = num_elem;
-    CeedScalar array[num_points];
+    CeedInt offset      = num_elem + 1;
+    CeedInt point_index = num_elem;
 
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
+      ind[i] = offset;
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        array[point_index] = i;
-        point_index        = (point_index + 1) % num_points;
+        ind[offset + j] = point_index;
+        point_index     = (point_index + 1) % num_points;
       }
+      offset += num_points_in_elem;
     }
-    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
+    ind[num_elem] = offset;
   }
-  CeedVectorCreate(ceed, num_points, &y);
+  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
+  CeedElemRestrictionCreateVector(elem_restriction, &x, &y);
   {
-    CeedInt offset      = num_elem + 1;
-    CeedInt point_index = num_elem;
+    CeedInt    point_index = num_elem;
+    CeedScalar array[num_points];
 
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
-      ind[i] = offset;
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        ind[offset + j] = point_index;
-        point_index     = (point_index + 1) % num_points;
+        array[point_index] = i;
+        point_index        = (point_index + 1) % num_points;
       }
-      offset += num_points_in_elem;
     }
-    ind[num_elem] = offset;
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
   }
-  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
+
   CeedElemRestrictionApply(elem_restriction, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
   {
     CeedInt           e_layout[3];
diff --git a/tests/t232-elemrestriction.c b/tests/t232-elemrestriction.c
index 66557dba28..1973f3b86a 100644
--- a/tests/t232-elemrestriction.c
+++ b/tests/t232-elemrestriction.c
@@ -13,39 +13,39 @@ int main(int argc, char **argv) {
 
   CeedInit(argv[1], &ceed);
 
-  CeedVectorCreate(ceed, num_points, &x);
   {
-    CeedInt    point_index = num_elem;
-    CeedScalar array[num_points];
+    CeedInt offset      = num_elem + 1;
+    CeedInt point_index = num_elem;
 
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
+      ind[i] = offset;
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        array[point_index] = i;
-        point_index        = (point_index + 1) % num_points;
+        ind[offset + j] = point_index;
+        point_index     = (point_index + 1) % num_points;
       }
+      offset += num_points_in_elem;
     }
-    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
+    ind[num_elem] = offset;
   }
+  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
+  CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
   {
-    CeedInt offset      = num_elem + 1;
-    CeedInt point_index = num_elem;
+    CeedInt    point_index = num_elem;
+    CeedScalar array[num_points];
 
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
-      ind[i] = offset;
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        ind[offset + j] = point_index;
-        point_index     = (point_index + 1) % num_points;
+        array[point_index] = i;
+        point_index        = (point_index + 1) % num_points;
       }
-      offset += num_points_in_elem;
     }
-    ind[num_elem] = offset;
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
   }
-  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
   {
     CeedInt max_points;
diff --git a/tests/t233-elemrestriction.c b/tests/t233-elemrestriction.c
index 1ad395b4d1..f42c2aed9f 100644
--- a/tests/t233-elemrestriction.c
+++ b/tests/t233-elemrestriction.c
@@ -14,9 +14,6 @@ int main(int argc, char **argv) {
 
   CeedInit(argv[1], &ceed);
 
-  CeedVectorCreate(ceed, num_points, &x);
-  CeedVectorSetValue(x, 0.0);
-
   {
     CeedInt offset      = num_elem + 1;
     CeedInt point_index = num_elem;
@@ -35,6 +32,8 @@ int main(int argc, char **argv) {
   }
   CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind, &elem_restriction);
 
+  CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
+  CeedVectorSetValue(x, 0.0);
   {
     CeedInt max_points;
 

From 1ef3a2a99284aa21c437e4d5e30bc6aa0d0f4b0f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 8 May 2024 10:17:03 -0600
Subject: [PATCH 029/571] test - fix impl vs support in error messages

---
 interface/ceed-basis.c            | 4 ++--
 interface/ceed-elemrestriction.c  | 8 +++++---
 interface/ceed-operator.c         | 4 ++--
 interface/ceed-qfunction.c        | 2 +-
 interface/ceed-qfunctioncontext.c | 2 +-
 interface/ceed-tensor.c           | 2 +-
 interface/ceed-vector.c           | 2 +-
 7 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 5618a087ab..ed24d72c66 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -977,7 +977,7 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisCreateTensorH1");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateTensorH1");
     CeedCall(CeedBasisCreateTensorH1(delegate, dim, num_comp, P_1d, Q_1d, interp_1d, grad_1d, q_ref_1d, q_weight_1d, basis));
     return CEED_ERROR_SUCCESS;
   }
@@ -1116,7 +1116,7 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisCreateH1");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1");
     CeedCall(CeedBasisCreateH1(delegate, topo, num_comp, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
     return CEED_ERROR_SUCCESS;
   }
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 7f778e9afe..f7bc180df3 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -243,7 +243,7 @@ int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, CeedMemType mem_type
     CeedCall(CeedElemRestrictionGetOffsets(rstr->rstr_base, mem_type, offsets));
   } else {
     CeedCheck(rstr->GetOffsets, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
-              "Backend does not support CeedElemRestrictionGetOffsets");
+              "Backend does not implement CeedElemRestrictionGetOffsets");
     CeedCall(rstr->GetOffsets(rstr, mem_type, offsets));
     rstr->num_readers++;
   }
@@ -284,7 +284,7 @@ int CeedElemRestrictionRestoreOffsets(CeedElemRestriction rstr, const CeedInt **
 **/
 int CeedElemRestrictionGetOrientations(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) {
   CeedCheck(rstr->GetOrientations, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
-            "Backend does not support CeedElemRestrictionGetOrientations");
+            "Backend does not implement CeedElemRestrictionGetOrientations");
   CeedCall(rstr->GetOrientations(rstr, mem_type, orients));
   rstr->num_readers++;
   return CEED_ERROR_SUCCESS;
@@ -320,7 +320,7 @@ int CeedElemRestrictionRestoreOrientations(CeedElemRestriction rstr, const bool
 **/
 int CeedElemRestrictionGetCurlOrientations(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) {
   CeedCheck(rstr->GetCurlOrientations, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
-            "Backend does not support CeedElemRestrictionGetCurlOrientations");
+            "Backend does not implement CeedElemRestrictionGetCurlOrientations");
   CeedCall(rstr->GetCurlOrientations(rstr, mem_type, curl_orients));
   rstr->num_readers++;
   return CEED_ERROR_SUCCESS;
@@ -1304,6 +1304,8 @@ int CeedElemRestrictionApplyAtPointsInElement(CeedElemRestriction rstr, CeedInt
   Ceed     ceed;
 
   CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
+  CeedCheck(rstr->ApplyAtPointsInElement, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionApplyAtPointsInElement");
+
   if (t_mode == CEED_NOTRANSPOSE) {
     CeedInt num_points, num_comp;
 
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index a4645dd2c6..9e96a1ac84 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -727,7 +727,7 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreate");
     CeedCall(CeedOperatorCreate(delegate, qf, dqf, dqfT, op));
     return CEED_ERROR_SUCCESS;
   }
@@ -770,7 +770,7 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorCreateAtPoints");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreateAtPoints");
     CeedCall(CeedOperatorCreateAtPoints(delegate, qf, dqf, dqfT, op));
     return CEED_ERROR_SUCCESS;
   }
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 226b33c19e..ad90099412 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -624,7 +624,7 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "QFunction"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionCreateInterior");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionCreateInterior");
     CeedCall(CeedQFunctionCreateInterior(delegate, vec_length, f, source, qf));
     return CEED_ERROR_SUCCESS;
   }
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index ddb9549fa4..16cc22cebe 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -569,7 +569,7 @@ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Context"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionContextCreate");
     CeedCall(CeedQFunctionContextCreate(delegate, ctx));
     return CEED_ERROR_SUCCESS;
   }
diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c
index 7cbc69e00c..d5b5578589 100644
--- a/interface/ceed-tensor.c
+++ b/interface/ceed-tensor.c
@@ -34,7 +34,7 @@ int CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "TensorContract"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedTensorContractCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedTensorContractCreate");
     CeedCall(CeedTensorContractCreate(delegate, contract));
     return CEED_ERROR_SUCCESS;
   }
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 39b72c770f..6c93f5b6df 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -163,7 +163,7 @@ int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
     CeedCall(CeedVectorCreate(delegate, length, vec));
     return CEED_ERROR_SUCCESS;
   }

From fe96005463bdbb79b892d21a5c89e2b475ecf62b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 7 May 2024 15:21:45 -0600
Subject: [PATCH 030/571] hip - add ElemRestrictionAtPoints

---
 backends/hip-ref/ceed-hip-ref-restriction.c | 100 +++++++++++++++-----
 backends/hip-ref/ceed-hip-ref.c             |   1 +
 backends/hip-ref/ceed-hip-ref.h             |   6 ++
 tests/t231-elemrestriction.c                |   1 +
 4 files changed, 84 insertions(+), 24 deletions(-)

diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 625430f9d7..32e8a9e96a 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -31,11 +31,15 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
-  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size));
+  } else {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  }
   is_deterministic = impl->d_l_vec_indices != NULL;
 
   // Compile HIP kernels
@@ -59,6 +63,7 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
+    case CEED_RESTRICTION_POINTS:
     case CEED_RESTRICTION_STANDARD: {
       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path));
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
@@ -118,11 +123,6 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
       CeedCall(CeedFree(&file_paths));
     } break;
-    case CEED_RESTRICTION_POINTS: {
-      // LCOV_EXCL_START
-      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-      // LCOV_EXCL_STOP
-    } break;
   }
   CeedCallBackend(CeedFree(&restriction_kernel_path));
   CeedCallBackend(CeedFree(&restriction_kernel_source));
@@ -174,6 +174,7 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
 
         CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS:
       case CEED_RESTRICTION_STANDARD: {
         void *args[] = {&impl->d_offsets, &d_u, &d_v};
 
@@ -205,11 +206,6 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
           CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args));
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   } else {
     // E-vector -> L-vector
@@ -223,6 +219,7 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
 
         CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS:
       case CEED_RESTRICTION_STANDARD: {
         if (!is_deterministic) {
           void *args[] = {&impl->d_offsets, &d_u, &d_v};
@@ -290,11 +287,6 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
           }
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   }
 
@@ -334,14 +326,16 @@ static int CeedElemRestrictionApplyUnoriented_Hip(CeedElemRestriction rstr, Ceed
 //------------------------------------------------------------------------------
 static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
   CeedElemRestriction_Hip *impl;
+  CeedRestrictionType      rstr_type;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   switch (mem_type) {
     case CEED_MEM_HOST:
-      *offsets = impl->h_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets;
       break;
     case CEED_MEM_DEVICE:
-      *offsets = impl->d_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->d_offsets_at_points : impl->d_offsets;
       break;
   }
   return CEED_ERROR_SUCCESS;
@@ -383,6 +377,17 @@ static int CeedElemRestrictionGetCurlOrientations_Hip(CeedElemRestriction rstr,
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Get offset for padded AtPoints E-layout
+//------------------------------------------------------------------------------
+static int CeedElemRestrictionGetAtPointsElementOffset_Hip(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) {
+  CeedInt layout[3];
+
+  CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout));
+  *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2];
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy restriction
 //------------------------------------------------------------------------------
@@ -404,6 +409,8 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) {
   CeedCallHip(ceed, hipFree((bool *)impl->d_orients_owned));
   CeedCallBackend(CeedFree(&impl->h_curl_orients_owned));
   CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_curl_orients_owned));
+  CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned));
+  CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_offsets_at_points_owned));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -411,18 +418,19 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) {
 //------------------------------------------------------------------------------
 // Create transpose offsets and indices
 //------------------------------------------------------------------------------
-static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt *indices) {
+static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt *indices) {
   Ceed                     ceed;
   bool                    *is_node;
   CeedSize                 l_size;
-  CeedInt                  num_elem, elem_size, num_comp, num_nodes = 0;
+  CeedInt                  num_elem, num_comp, num_nodes = 0;
   CeedInt                 *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices;
+  CeedRestrictionType      rstr_type;
   CeedElemRestriction_Hip *impl;
 
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   const CeedInt size_indices = num_elem * elem_size;
@@ -505,6 +513,15 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  // Use max number of points as elem size for AtPoints restrictions
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedInt max_points = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]);
+    }
+    elem_size = max_points;
+  }
   const CeedInt size = num_elem * elem_size;
 
   CeedCallBackend(CeedCalloc(1, &impl));
@@ -525,6 +542,37 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
     }
   }
 
+  // Pad AtPoints indices
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1;
+    CeedInt  max_points = elem_size, *offsets_padded;
+
+    CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction");
+    CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded));
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+
+      at_points_size += num_points;
+      // -- Copy all points in element
+      for (CeedInt j = 0; j < num_points; j++) {
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + j];
+      }
+      // -- Replicate out last point in element
+      for (CeedInt j = num_points; j < max_points; j++) {
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1];
+      }
+    }
+    CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,
+                                            &impl->h_offsets_at_points));
+    CeedCallHip(ceed, hipMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt)));
+    CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt),
+                                hipMemcpyHostToDevice));
+    impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned;
+    // -- Use padded offsets for the rest of the setup
+    offsets   = (const CeedInt *)offsets_padded;
+    copy_mode = CEED_OWN_POINTER;
+  }
+
   // Set up device offset/orientation arrays
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     switch (mem_type) {
@@ -533,7 +581,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallHip(ceed, hipMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt)));
         CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), hipMemcpyHostToDevice));
         impl->d_offsets = (CeedInt *)impl->d_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, elem_size, offsets));
       } break;
       case CEED_MEM_DEVICE: {
         CeedCallBackend(CeedSetDeviceCeedIntArray_Hip(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed,
@@ -541,7 +589,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned));
         CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), hipMemcpyDeviceToHost));
         impl->h_offsets = impl->h_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, elem_size, offsets));
       } break;
     }
 
@@ -591,6 +639,10 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Hip));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(
+        CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", CeedElemRestrictionGetAtPointsElementOffset_Hip));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c
index 8494d127e3..c9f765c70f 100644
--- a/backends/hip-ref/ceed-hip-ref.c
+++ b/backends/hip-ref/ceed-hip-ref.c
@@ -57,6 +57,7 @@ static int CeedInit_Hip_ref(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateAtPoints", CeedElemRestrictionCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip));
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 815790c53c..57e4e89162 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -52,6 +52,12 @@ typedef struct {
   const CeedInt8 *d_curl_orients;
   const CeedInt8 *d_curl_orients_borrowed;
   const CeedInt8 *d_curl_orients_owned;
+  const CeedInt  *h_offsets_at_points;
+  const CeedInt  *h_offsets_at_points_borrowed;
+  const CeedInt  *h_offsets_at_points_owned;
+  const CeedInt  *d_offsets_at_points;
+  const CeedInt  *d_offsets_at_points_borrowed;
+  const CeedInt  *d_offsets_at_points_owned;
 } CeedElemRestriction_Hip;
 
 typedef struct {
diff --git a/tests/t231-elemrestriction.c b/tests/t231-elemrestriction.c
index b949044143..6073602838 100644
--- a/tests/t231-elemrestriction.c
+++ b/tests/t231-elemrestriction.c
@@ -33,6 +33,7 @@ int main(int argc, char **argv) {
   CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
   CeedElemRestrictionCreateVector(elem_restriction, &x, &y);
+  CeedVectorSetValue(y, 0.0);
   {
     CeedInt    point_index = num_elem;
     CeedScalar array[num_points];

From ff1bc20e5f1db9421803faf3e8c80e7a5d10bee0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 8 May 2024 11:32:18 -0600
Subject: [PATCH 031/571] rstr - allow backends to add padding to AtPoints
 E-vec

---
 backends/hip-ref/ceed-hip-ref-restriction.c   |  5 +++-
 backends/memcheck/ceed-memcheck-restriction.c | 15 ++++++++++++
 backends/ref/ceed-ref-operator.c              | 22 +++---------------
 backends/ref/ceed-ref-restriction.c           | 15 ++++++++++++
 include/ceed/backend.h                        |  1 +
 interface/ceed-elemrestriction.c              | 23 +++++++++++++++++++
 6 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 32e8a9e96a..91d92c0ac1 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -503,7 +503,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
                                   const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
   Ceed                     ceed, ceed_parent;
   bool                     is_deterministic;
-  CeedInt                  num_elem, elem_size;
+  CeedInt                  num_elem, num_comp, elem_size;
   CeedRestrictionType      rstr_type;
   CeedElemRestriction_Hip *impl;
 
@@ -511,6 +511,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
   CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
+  CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   // Use max number of points as elem size for AtPoints restrictions
@@ -568,9 +569,11 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
     CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt),
                                 hipMemcpyHostToDevice));
     impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned;
+
     // -- Use padded offsets for the rest of the setup
     offsets   = (const CeedInt *)offsets_padded;
     copy_mode = CEED_OWN_POINTER;
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, at_points_size * num_comp));
   }
 
   // Set up device offset/orientation arrays
diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index f2877c3a69..298ae43c6c 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -673,6 +673,21 @@ int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_m
     }
   }
 
+  // Expand E-vector size for AtPoints
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize max_points = 0, num_points_total = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+
+      max_points = CeedIntMax(max_points, num_points);
+      num_points_total += num_points;
+    }
+    // -- Increase size for last element
+    num_points_total += (max_points - (offsets[num_elem] - offsets[num_elem - 1]));
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, num_points_total * num_comp));
+  }
+
   // Offsets data
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     // Check indices
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 5347580d2c..534c0313b2 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -556,7 +556,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
                                                CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
-  CeedInt             e_size_padding = 0, max_num_points, num_comp, size, P;
+  CeedInt             max_num_points, num_comp, size, P;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
 
@@ -600,26 +600,10 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
     if (eval_mode != CEED_EVAL_WEIGHT) {
       CeedElemRestriction elem_rstr;
-      CeedSize            e_size;
-      bool                is_at_points;
 
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
-      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-      CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points));
-      if (is_at_points) {
-        CeedCallBackend(CeedElemRestrictionGetEVectorSize(elem_rstr, &e_size));
-        if (e_size_padding == 0) {
-          CeedInt num_points, num_elem;
-
-          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, num_elem - 1, &num_points));
-          e_size_padding = (max_num_points - num_points) * num_comp;
-        }
-        CeedCallBackend(CeedVectorCreate(ceed, e_size + e_size_padding, &e_vecs_full[i + start_e]));
-        CeedCallBackend(CeedVectorSetValue(e_vecs_full[i + start_e], 0.0));
-      } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e]));
-      }
+      CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e]));
+      CeedCallBackend(CeedVectorSetValue(e_vecs_full[i + start_e], 0.0));
     }
 
     switch (eval_mode) {
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 08416e4d06..8dca3a90a7 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -753,6 +753,21 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode,
     }
   }
 
+  // Expand E-vector size for AtPoints
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize max_points = 0, num_points_total = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+
+      max_points = CeedIntMax(max_points, num_points);
+      num_points_total += num_points;
+    }
+    // -- Increase size for last element
+    num_points_total += (max_points - (offsets[num_elem] - offsets[num_elem - 1]));
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, num_points_total * num_comp));
+  }
+
   // Offsets data
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     const char *resource;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index ee61d638e6..1f78fcaec4 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -293,6 +293,7 @@ CEED_EXTERN int CeedElemRestrictionSetLLayout(CeedElemRestriction rstr, CeedInt
 CEED_EXTERN int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, CeedInt layout[3]);
 CEED_EXTERN int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]);
 CEED_EXTERN int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset);
+CEED_EXTERN int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize e_size);
 CEED_EXTERN int CeedElemRestrictionGetData(CeedElemRestriction rstr, void *data);
 CEED_EXTERN int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data);
 CEED_EXTERN int CeedElemRestrictionReference(CeedElemRestriction rstr);
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index f7bc180df3..1021e1352a 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -468,6 +468,29 @@ int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedIn
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+
+  @brief Set the E-vector size of a `CeedElemRestriction` at points
+
+  @param[in,out]  rstr   `CeedElemRestriction`
+  @param[in]      e_size New E-vector size; must be longer than the current E-vector size
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize e_size) {
+  CeedRestrictionType rstr_type;
+  Ceed                ceed;
+
+  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
+  CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
+  CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_INCOMPATIBLE, "Can only compute offset for a points CeedElemRestriction");
+  CeedCheck(e_size >= rstr->e_size, ceed, CEED_ERROR_INCOMPATIBLE, "Can only increase the size of the E-vector for the CeedElemRestriction");
+  rstr->e_size = e_size;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the backend data of a `CeedElemRestriction`
 

From 637baffd68e35dfef8a5fda1061b9adf5878d273 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 8 May 2024 11:33:32 -0600
Subject: [PATCH 032/571] rstr - minor naming fix

---
 backends/ref/ceed-ref-operator.c | 4 ++--
 include/ceed/backend.h           | 2 +-
 interface/ceed-elemrestriction.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 534c0313b2..e9ba66fcc2 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -928,7 +928,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
           bool                is_at_points = false;
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points));
           CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points");
         }
         // Get size of active input
@@ -965,7 +965,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
           bool                is_at_points = false;
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points));
           CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points");
         }
         // Get size of active output
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 1f78fcaec4..bd9b365814 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -278,7 +278,7 @@ typedef enum {
 
 CEED_EXTERN int CeedElemRestrictionGetType(CeedElemRestriction rstr, CeedRestrictionType *rstr_type);
 CEED_EXTERN int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided);
-CEED_EXTERN int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points);
+CEED_EXTERN int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points);
 CEED_EXTERN int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible);
 CEED_EXTERN int CeedElemRestrictionGetStrides(CeedElemRestriction rstr, CeedInt strides[3]);
 CEED_EXTERN int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, bool *has_backend_strides);
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 1021e1352a..242c78e455 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -146,7 +146,7 @@ int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided) {
 
   @ref Backend
 **/
-int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points) {
+int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points) {
   *is_points = (rstr->rstr_type == CEED_RESTRICTION_POINTS);
   return CEED_ERROR_SUCCESS;
 }

From b20a4af9cd65fb919c98e62df325832b3aefde7d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 8 May 2024 11:37:50 -0600
Subject: [PATCH 033/571] cuda - add ElemRestrictionAtPoints

---
 backends/cuda-ref/ceed-cuda-ref-restriction.c | 105 +++++++++++++-----
 backends/cuda-ref/ceed-cuda-ref.c             |   1 +
 backends/cuda-ref/ceed-cuda-ref.h             |   6 +
 3 files changed, 87 insertions(+), 25 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index f253b5413d..6035f2b956 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -32,11 +32,15 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
-  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size));
+  } else {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  }
   is_deterministic = impl->d_l_vec_indices != NULL;
 
   // Compile CUDA kernels
@@ -60,6 +64,7 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
+    case CEED_RESTRICTION_POINTS:
     case CEED_RESTRICTION_STANDARD: {
       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path));
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
@@ -119,11 +124,6 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
       CeedCall(CeedFree(&file_paths));
     } break;
-    case CEED_RESTRICTION_POINTS: {
-      // LCOV_EXCL_START
-      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-      // LCOV_EXCL_STOP
-    } break;
   }
   CeedCallBackend(CeedFree(&restriction_kernel_path));
   CeedCallBackend(CeedFree(&restriction_kernel_source));
@@ -175,6 +175,7 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
 
         CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS:
       case CEED_RESTRICTION_STANDARD: {
         void *args[] = {&impl->d_offsets, &d_u, &d_v};
 
@@ -206,11 +207,6 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
           CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args));
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   } else {
     // E-vector -> L-vector
@@ -224,6 +220,7 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
 
         CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS:
       case CEED_RESTRICTION_STANDARD: {
         if (!is_deterministic) {
           void *args[] = {&impl->d_offsets, &d_u, &d_v};
@@ -291,11 +288,6 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
           }
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   }
 
@@ -335,14 +327,16 @@ static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, Cee
 //------------------------------------------------------------------------------
 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
   CeedElemRestriction_Cuda *impl;
+  CeedRestrictionType       rstr_type;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   switch (mem_type) {
     case CEED_MEM_HOST:
-      *offsets = impl->h_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets;
       break;
     case CEED_MEM_DEVICE:
-      *offsets = impl->d_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->d_offsets_at_points : impl->d_offsets;
       break;
   }
   return CEED_ERROR_SUCCESS;
@@ -384,6 +378,17 @@ static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr,
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Get offset for padded AtPoints E-layout
+//------------------------------------------------------------------------------
+static int CeedElemRestrictionGetAtPointsElementOffset_Cuda(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) {
+  CeedInt layout[3];
+
+  CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout));
+  *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2];
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy restriction
 //------------------------------------------------------------------------------
@@ -405,6 +410,8 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) {
   CeedCallCuda(ceed, cudaFree((bool *)impl->d_orients_owned));
   CeedCallBackend(CeedFree(&impl->h_curl_orients_owned));
   CeedCallCuda(ceed, cudaFree((CeedInt8 *)impl->d_curl_orients_owned));
+  CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned));
+  CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_at_points_owned));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -412,18 +419,19 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) {
 //------------------------------------------------------------------------------
 // Create transpose offsets and indices
 //------------------------------------------------------------------------------
-static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) {
+static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt *indices) {
   Ceed                      ceed;
   bool                     *is_node;
   CeedSize                  l_size;
-  CeedInt                   num_elem, elem_size, num_comp, num_nodes = 0;
+  CeedInt                   num_elem, num_comp, num_nodes = 0;
   CeedInt                  *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices;
+  CeedRestrictionType       rstr_type;
   CeedElemRestriction_Cuda *impl;
 
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   const CeedInt size_indices = num_elem * elem_size;
@@ -496,7 +504,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
                                    const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
   Ceed                      ceed, ceed_parent;
   bool                      is_deterministic;
-  CeedInt                   num_elem, elem_size;
+  CeedInt                   num_elem, num_comp, elem_size;
   CeedRestrictionType       rstr_type;
   CeedElemRestriction_Cuda *impl;
 
@@ -504,8 +512,18 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
   CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
+  CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  // Use max number of points as elem size for AtPoints restrictions
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedInt max_points = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]);
+    }
+    elem_size = max_points;
+  }
   const CeedInt size = num_elem * elem_size;
 
   CeedCallBackend(CeedCalloc(1, &impl));
@@ -526,6 +544,39 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
     }
   }
 
+  // Pad AtPoints indices
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1;
+    CeedInt  max_points = elem_size, *offsets_padded;
+
+    CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction");
+    CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded));
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+
+      at_points_size += num_points;
+      // -- Copy all points in element
+      for (CeedInt j = 0; j < num_points; j++) {
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + j];
+      }
+      // -- Replicate out last point in element
+      for (CeedInt j = num_points; j < max_points; j++) {
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1];
+      }
+    }
+    CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,
+                                            &impl->h_offsets_at_points));
+    CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt)));
+    CeedCallCuda(ceed, cudaMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt),
+                                  cudaMemcpyHostToDevice));
+    impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned;
+
+    // -- Use padded offsets for the rest of the setup
+    offsets   = (const CeedInt *)offsets_padded;
+    copy_mode = CEED_OWN_POINTER;
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, at_points_size * num_comp));
+  }
+
   // Set up device offset/orientation arrays
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     switch (mem_type) {
@@ -534,7 +585,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt)));
         CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), cudaMemcpyHostToDevice));
         impl->d_offsets = (CeedInt *)impl->d_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets));
       } break;
       case CEED_MEM_DEVICE: {
         CeedCallBackend(CeedSetDeviceCeedIntArray_Cuda(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed,
@@ -542,7 +593,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned));
         CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), cudaMemcpyDeviceToHost));
         impl->h_offsets = impl->h_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets));
       } break;
     }
 
@@ -592,6 +643,10 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(
+        CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", CeedElemRestrictionGetAtPointsElementOffset_Cuda));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c
index bbfa8cf875..2f00d6c1dc 100644
--- a/backends/cuda-ref/ceed-cuda-ref.c
+++ b/backends/cuda-ref/ceed-cuda-ref.c
@@ -57,6 +57,7 @@ static int CeedInit_Cuda_ref(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateAtPoints", CeedElemRestrictionCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda));
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 349aa8ef3a..f6ea09290a 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -48,6 +48,12 @@ typedef struct {
   const CeedInt8 *d_curl_orients;
   const CeedInt8 *d_curl_orients_borrowed;
   const CeedInt8 *d_curl_orients_owned;
+  const CeedInt  *h_offsets_at_points;
+  const CeedInt  *h_offsets_at_points_borrowed;
+  const CeedInt  *h_offsets_at_points_owned;
+  const CeedInt  *d_offsets_at_points;
+  const CeedInt  *d_offsets_at_points_borrowed;
+  const CeedInt  *d_offsets_at_points_owned;
 } CeedElemRestriction_Cuda;
 
 typedef struct {

From 831877b7501cd747c77484681d1f0b85164de77f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 9 May 2024 10:56:00 -0600
Subject: [PATCH 034/571] tidy - minor fix of unititalized value warning

---
 backends/ref/ceed-ref-operator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index e9ba66fcc2..29c29bca01 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1097,7 +1097,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref(CeedOperator op
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool                is_active_at_points = true;
-  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, elem_size_active = 1, num_comp_active;
+  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, elem_size_active = 1, num_comp_active = 1;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {0};
   Ceed                ceed;
   CeedVector          point_coords = NULL, in_vec, out_vec;

From 1809c5f74680557a641e900d791842bafdfb4e8b Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 9 May 2024 17:00:37 -0600
Subject: [PATCH 035/571] doc: Mention PETSC_ARCH requirement for examples
 (#1580)

* doc: Mention PETSC_ARCH requirement for examples

* Update README.md

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>

---------

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 99ba285426..45f3a08489 100644
--- a/README.md
+++ b/README.md
@@ -248,7 +248,7 @@ The backends which are capable of generating reproducible results, with the prop
 libCEED comes with several examples of its usage, ranging from standalone C codes in the `/examples/ceed` directory to examples based on external packages, such as MFEM, PETSc, and Nek5000.
 Nek5000 v18.0 or greater is required.
 
-To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and run:
+To build the examples, set the `MFEM_DIR`, `PETSC_DIR` (and optionally `PETSC_ARCH`), and `NEK5K_DIR` variables and run:
 
 ```console
 $ cd examples/

From 64a7ec2fadfbfdde96c718d988ace8687947172c Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 14 May 2024 11:52:10 -0600
Subject: [PATCH 036/571] Fix bug in diagonal assembly for point operators

---
 backends/ref/ceed-ref-operator.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 29c29bca01..b549725d6d 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1232,17 +1232,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
       // -- Grab diagonal value
       for (CeedInt i = 0; i < num_output_fields; i++) {
-        bool                is_active_input = false;
+        bool                is_active_output = false;
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
         CeedVector          vec;
         CeedElemRestriction elem_rstr;
         CeedBasis           basis;
 
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
         // ---- Skip non-active input
-        is_active_input = vec == CEED_VECTOR_ACTIVE;
-        if (!is_active_input) continue;
+        is_active_output = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active_output) continue;
 
         // ---- Get elem_size, eval_mode, size
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));

From 0c7f167fae0f659d98e17cd75830df6c4815d8c2 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 14 May 2024 12:28:19 -0600
Subject: [PATCH 037/571] Update backends/ref/ceed-ref-operator.c

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>
---
 backends/ref/ceed-ref-operator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index b549725d6d..91ccf97b07 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1240,7 +1240,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
         CeedBasis           basis;
 
         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-        // ---- Skip non-active input
+        // ---- Skip non-active output
         is_active_output = vec == CEED_VECTOR_ACTIVE;
         if (!is_active_output) continue;
 

From 725a297d23d400e2460da2b936488ccfc1ce09fd Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 14 May 2024 12:39:56 -0600
Subject: [PATCH 038/571] Fix docs for CeedElemRestrictionGetNumPoints

---
 interface/ceed-elemrestriction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 242c78e455..2e78628087 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -1490,10 +1490,10 @@ int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, CeedInt *elem_si
 
 /**
 
-  @brief Get the number of points in the l-vector for a points `CeedElemRestriction`
+  @brief Get the number of points in the offsets array for a points `CeedElemRestriction`
 
   @param[in]  rstr       `CeedElemRestriction`
-  @param[out] num_points The number of points in the l-vector
+  @param[out] num_points The number of points in the offsets array
 
   @return An error code: 0 - success, otherwise - failure
 

From 8be297eeb698398a451c04ebf9ca4fa18990f35d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 14 May 2024 13:33:09 -0600
Subject: [PATCH 039/571] points - fix gpu conversion to standard indexing for
 num_comp with rstr AtPoints

---
 backends/cuda-ref/ceed-cuda-ref-restriction.c | 4 ++--
 backends/hip-ref/ceed-hip-ref-restriction.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 6035f2b956..00f0dc8e19 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -557,11 +557,11 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
       at_points_size += num_points;
       // -- Copy all points in element
       for (CeedInt j = 0; j < num_points; j++) {
-        offsets_padded[i * max_points + j] = offsets[offsets[i] + j];
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp;
       }
       // -- Replicate out last point in element
       for (CeedInt j = num_points; j < max_points; j++) {
-        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1];
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1] * num_comp;
       }
     }
     CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 91d92c0ac1..6c3fdee598 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -556,11 +556,11 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
       at_points_size += num_points;
       // -- Copy all points in element
       for (CeedInt j = 0; j < num_points; j++) {
-        offsets_padded[i * max_points + j] = offsets[offsets[i] + j];
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp;
       }
       // -- Replicate out last point in element
       for (CeedInt j = num_points; j < max_points; j++) {
-        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1];
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1] * num_comp;
       }
     }
     CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,

From 0b63de31bb7a3640b13441826f394b60c75e3d85 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 17 May 2024 11:36:06 -0600
Subject: [PATCH 040/571] rstr - transpose AtPoints restriction fixes

---
 backends/cuda-ref/ceed-cuda-ref-restriction.c | 44 ++++++++++-
 backends/cuda-ref/ceed-cuda-ref.h             |  6 ++
 backends/hip-ref/ceed-hip-ref-restriction.c   | 44 ++++++++++-
 backends/hip-ref/ceed-hip-ref.h               |  6 ++
 backends/memcheck/ceed-memcheck-restriction.c |  2 +-
 backends/ref/ceed-ref-restriction.c           |  2 +-
 .../cuda/cuda-ref-restriction-at-points.h     | 57 ++++++++++++++
 .../hip/hip-ref-restriction-at-points.h       | 57 ++++++++++++++
 tests/t231-elemrestriction.c                  |  2 +-
 tests/t232-elemrestriction.c                  | 32 ++++----
 tests/t233-elemrestriction.c                  | 48 ++++++------
 tests/t234-elemrestriction.c                  | 75 +++++++++++++++++++
 12 files changed, 328 insertions(+), 47 deletions(-)
 create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
 create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
 create mode 100644 tests/t234-elemrestriction.c

diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 00f0dc8e19..7e381e77e0 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -64,7 +64,23 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
-    case CEED_RESTRICTION_POINTS:
+    case CEED_RESTRICTION_POINTS: {
+      const char *offset_kernel_path;
+      char      **file_paths     = NULL;
+      CeedInt     num_file_paths = 0;
+
+      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-at-points.h", &restriction_kernel_path));
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
+      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
+      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path));
+      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
+                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
+                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
+    } break;
     case CEED_RESTRICTION_STANDARD: {
       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path));
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
@@ -220,7 +236,17 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
 
         CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
       } break;
-      case CEED_RESTRICTION_POINTS:
+      case CEED_RESTRICTION_POINTS: {
+        if (!is_deterministic) {
+          void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
+        } else {
+          void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
+        }
+      } break;
       case CEED_RESTRICTION_STANDARD: {
         if (!is_deterministic) {
           void *args[] = {&impl->d_offsets, &d_u, &d_v};
@@ -412,6 +438,8 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) {
   CeedCallCuda(ceed, cudaFree((CeedInt8 *)impl->d_curl_orients_owned));
   CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned));
   CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_at_points_owned));
+  CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned));
+  CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_points_per_elem_owned));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -547,13 +575,15 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
   // Pad AtPoints indices
   if (rstr_type == CEED_RESTRICTION_POINTS) {
     CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1;
-    CeedInt  max_points = elem_size, *offsets_padded;
+    CeedInt  max_points = elem_size, *offsets_padded, *points_per_elem;
 
     CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction");
     CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded));
+    CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points = offsets[i + 1] - offsets[i];
 
+      points_per_elem[i] = num_points;
       at_points_size += num_points;
       // -- Copy all points in element
       for (CeedInt j = 0; j < num_points; j++) {
@@ -575,6 +605,14 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
     offsets   = (const CeedInt *)offsets_padded;
     copy_mode = CEED_OWN_POINTER;
     CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, at_points_size * num_comp));
+
+    // -- Points per element
+    CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned,
+                                            &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem));
+    CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt)));
+    CeedCallCuda(ceed,
+                 cudaMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), cudaMemcpyHostToDevice));
+    impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned;
   }
 
   // Set up device offset/orientation arrays
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index f6ea09290a..f5a9f059e4 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -54,6 +54,12 @@ typedef struct {
   const CeedInt  *d_offsets_at_points;
   const CeedInt  *d_offsets_at_points_borrowed;
   const CeedInt  *d_offsets_at_points_owned;
+  const CeedInt  *h_points_per_elem;
+  const CeedInt  *h_points_per_elem_borrowed;
+  const CeedInt  *h_points_per_elem_owned;
+  const CeedInt  *d_points_per_elem;
+  const CeedInt  *d_points_per_elem_borrowed;
+  const CeedInt  *d_points_per_elem_owned;
 } CeedElemRestriction_Cuda;
 
 typedef struct {
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 6c3fdee598..0cbdc64c3b 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -63,7 +63,23 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
-    case CEED_RESTRICTION_POINTS:
+    case CEED_RESTRICTION_POINTS: {
+      const char *offset_kernel_path;
+      char      **file_paths     = NULL;
+      CeedInt     num_file_paths = 0;
+
+      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-at-points.h", &restriction_kernel_path));
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
+      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
+      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path));
+      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
+                                      "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
+                                      "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
+      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
+      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
+    } break;
     case CEED_RESTRICTION_STANDARD: {
       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path));
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
@@ -219,7 +235,17 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
 
         CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args));
       } break;
-      case CEED_RESTRICTION_POINTS:
+      case CEED_RESTRICTION_POINTS: {
+        if (!is_deterministic) {
+          void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args));
+        } else {
+          void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args));
+        }
+      } break;
       case CEED_RESTRICTION_STANDARD: {
         if (!is_deterministic) {
           void *args[] = {&impl->d_offsets, &d_u, &d_v};
@@ -411,6 +437,8 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) {
   CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_curl_orients_owned));
   CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned));
   CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_offsets_at_points_owned));
+  CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned));
+  CeedCallHip(ceed, hipFree((CeedInt *)impl->d_points_per_elem_owned));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -546,13 +574,15 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
   // Pad AtPoints indices
   if (rstr_type == CEED_RESTRICTION_POINTS) {
     CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1;
-    CeedInt  max_points = elem_size, *offsets_padded;
+    CeedInt  max_points = elem_size, *offsets_padded, *points_per_elem;
 
     CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction");
     CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded));
+    CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points = offsets[i + 1] - offsets[i];
 
+      points_per_elem[i] = num_points;
       at_points_size += num_points;
       // -- Copy all points in element
       for (CeedInt j = 0; j < num_points; j++) {
@@ -574,6 +604,14 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
     offsets   = (const CeedInt *)offsets_padded;
     copy_mode = CEED_OWN_POINTER;
     CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, at_points_size * num_comp));
+
+    // -- Points per element
+    CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned,
+                                            &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem));
+    CeedCallHip(ceed, hipMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt)));
+    CeedCallHip(ceed,
+                hipMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), hipMemcpyHostToDevice));
+    impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned;
   }
 
   // Set up device offset/orientation arrays
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 57e4e89162..403cdb71e2 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -58,6 +58,12 @@ typedef struct {
   const CeedInt  *d_offsets_at_points;
   const CeedInt  *d_offsets_at_points_borrowed;
   const CeedInt  *d_offsets_at_points_owned;
+  const CeedInt  *h_points_per_elem;
+  const CeedInt  *h_points_per_elem_borrowed;
+  const CeedInt  *h_points_per_elem_owned;
+  const CeedInt  *d_points_per_elem;
+  const CeedInt  *d_points_per_elem_borrowed;
+  const CeedInt  *d_points_per_elem_owned;
 } CeedElemRestriction_Hip;
 
 typedef struct {
diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index 298ae43c6c..35d3016726 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -381,7 +381,7 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(CeedEl
       }
     } else {
       for (CeedSize i = 0; i < num_points; i++) {
-        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset];
+        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] += uu[j * num_points + i + e_vec_offset];
       }
     }
     e_vec_offset += num_points * (CeedSize)num_comp;
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 8dca3a90a7..068b9906f8 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -384,7 +384,7 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes
       }
     } else {
       for (CeedSize i = 0; i < num_points; i++) {
-        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset];
+        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] += uu[j * num_points + i + e_vec_offset];
       }
     }
     e_vec_offset += num_points * (CeedSize)num_comp;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
new file mode 100644
index 0000000000..83c4086ed0
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA offset element restriction kernels
+
+#include <ceed.h>
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, standard (with offsets)
+//------------------------------------------------------------------------------
+#if !USE_DETERMINISTIC
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices, const CeedInt *__restrict__ points_per_elem,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
+    const CeedInt ind      = indices[node];
+    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
+    const CeedInt elem     = node / RSTR_ELEM_SIZE;
+
+    if (loc_node >= points_per_elem[elem]) continue;
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+    }
+  }
+}
+#else
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
+                                             const CeedInt *__restrict__ points_per_elem, const CeedInt *__restrict__ t_offsets,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  CeedScalar value[RSTR_NUM_COMP];
+
+  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
+    const CeedInt ind     = l_vec_indices[i];
+    const CeedInt range_1 = t_offsets[i];
+    const CeedInt range_N = t_offsets[i + 1];
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
+
+    for (CeedInt j = range_1; j < range_N; j++) {
+      const CeedInt t_ind    = t_indices[j];
+      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
+      const CeedInt elem     = t_ind / RSTR_ELEM_SIZE;
+
+      if (loc_node >= points_per_elem[elem]) continue;
+      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE];
+      }
+    }
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
+  }
+}
+#endif
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
new file mode 100644
index 0000000000..614628a81f
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP offset element restriction kernels
+
+#include <ceed.h>
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, standard (with offsets)
+//------------------------------------------------------------------------------
+#if !USE_DETERMINISTIC
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices, const CeedInt *__restrict__ points_per_elem,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
+    const CeedInt ind      = indices[node];
+    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
+    const CeedInt elem     = node / RSTR_ELEM_SIZE;
+
+    if (loc_node >= points_per_elem[elem]) continue;
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+    }
+  }
+}
+#else
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
+                                             const CeedInt *__restrict__ points_per_elem, const CeedInt *__restrict__ t_offsets,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  CeedScalar value[RSTR_NUM_COMP];
+
+  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
+    const CeedInt ind     = l_vec_indices[i];
+    const CeedInt range_1 = t_offsets[i];
+    const CeedInt range_N = t_offsets[i + 1];
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
+
+    for (CeedInt j = range_1; j < range_N; j++) {
+      const CeedInt t_ind    = t_indices[j];
+      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
+      const CeedInt elem     = t_ind / RSTR_ELEM_SIZE;
+
+      if (loc_node >= points_per_elem[elem]) continue;
+      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE];
+      }
+    }
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
+  }
+}
+#endif
diff --git a/tests/t231-elemrestriction.c b/tests/t231-elemrestriction.c
index 6073602838..21077001aa 100644
--- a/tests/t231-elemrestriction.c
+++ b/tests/t231-elemrestriction.c
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
         if (i != read_array[elem_offset + j * e_layout[0]]) {
           // LCOV_EXCL_START
-          printf("Error in restricted array y[%" CeedInt_FMT "] = %f\n != %f\n", (CeedInt)elem_offset + j * e_layout[0],
+          printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", (CeedInt)elem_offset + j * e_layout[0],
                  (CeedScalar)read_array[elem_offset + j * e_layout[0]], (CeedScalar)i);
           // LCOV_EXCL_STOP
         }
diff --git a/tests/t232-elemrestriction.c b/tests/t232-elemrestriction.c
index 1973f3b86a..7632875fcf 100644
--- a/tests/t232-elemrestriction.c
+++ b/tests/t232-elemrestriction.c
@@ -1,7 +1,8 @@
 /// @file
-/// Test creation, use, and destruction of an element restriction at points for single elements
-/// \test Test creation, use, and destruction of an element restriction at points for single elements
+/// Test creation, use, and destruction of an element restriction at points
+/// \test Test creation, use, and destruction of an element restriction at points
 #include <ceed.h>
+#include <ceed/backend.h>
 #include <stdio.h>
 
 int main(int argc, char **argv) {
@@ -31,7 +32,8 @@ int main(int argc, char **argv) {
   }
   CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
-  CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
+  CeedElemRestrictionCreateVector(elem_restriction, &x, &y);
+  CeedVectorSetValue(y, 0.0);
   {
     CeedInt    point_index = num_elem;
     CeedScalar array[num_points];
@@ -47,30 +49,26 @@ int main(int argc, char **argv) {
     CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
   }
 
+  CeedElemRestrictionApply(elem_restriction, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
+  CeedElemRestrictionApply(elem_restriction, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE);
   {
-    CeedInt max_points;
+    CeedInt           point_index = num_elem;
+    const CeedScalar *read_array;
 
-    CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
-    CeedVectorCreate(ceed, max_points, &y);
-  }
-
-  {
+    CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
     for (CeedInt i = 0; i < num_elem; i++) {
-      CeedInt           num_points_in_elem = (i + 1) % num_elem + 1;
-      const CeedScalar *read_array;
-
-      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
-      CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
+      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        if (i != read_array[j]) {
+        if (read_array[point_index] != 2 * i) {
           // LCOV_EXCL_START
-          printf("Error in restricted element array %" CeedInt_FMT " y[%" CeedInt_FMT "] = %f\n", i, j, (CeedScalar)read_array[j]);
+          printf("Error in restricted array x[%" CeedInt_FMT "] = %f != %f\n", point_index, read_array[point_index], 2.0 * i);
           // LCOV_EXCL_STOP
         }
+        point_index = (point_index + 1) % num_points;
       }
-      CeedVectorRestoreArrayRead(y, &read_array);
     }
+    CeedVectorRestoreArrayRead(x, &read_array);
   }
 
   CeedVectorDestroy(&x);
diff --git a/tests/t233-elemrestriction.c b/tests/t233-elemrestriction.c
index f42c2aed9f..1973f3b86a 100644
--- a/tests/t233-elemrestriction.c
+++ b/tests/t233-elemrestriction.c
@@ -1,8 +1,7 @@
 /// @file
-/// Test creation, transpose use, and destruction of an element restriction at points for single elements
-/// \test Test creation, transpose use, and destruction of an element restriction at points for single elements
+/// Test creation, use, and destruction of an element restriction at points for single elements
+/// \test Test creation, use, and destruction of an element restriction at points for single elements
 #include <ceed.h>
-#include <math.h>
 #include <stdio.h>
 
 int main(int argc, char **argv) {
@@ -30,40 +29,47 @@ int main(int argc, char **argv) {
     }
     ind[num_elem] = offset;
   }
-  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind, &elem_restriction);
+  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
   CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
-  CeedVectorSetValue(x, 0.0);
+  {
+    CeedInt    point_index = num_elem;
+    CeedScalar array[num_points];
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
+
+      for (CeedInt j = 0; j < num_points_in_elem; j++) {
+        array[point_index] = i;
+        point_index        = (point_index + 1) % num_points;
+      }
+    }
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
+  }
+
   {
     CeedInt max_points;
 
     CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
     CeedVectorCreate(ceed, max_points, &y);
-    CeedVectorSetValue(y, 1.0);
   }
 
   {
     for (CeedInt i = 0; i < num_elem; i++) {
-      CeedInt           point_index = num_elem;
+      CeedInt           num_points_in_elem = (i + 1) % num_elem + 1;
       const CeedScalar *read_array;
 
-      CeedVectorSetValue(x, 0.0);
-      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE);
-
-      CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
-      for (CeedInt j = 0; j < num_elem; j++) {
-        CeedInt num_points_in_elem = (j + 1) % num_elem + 1;
+      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
+      CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
 
-        for (CeedInt k = 0; k < num_points_in_elem; k++) {
-          if (fabs(read_array[point_index] - (i == j ? 1.0 : 0.0)) > 10 * CEED_EPSILON) {
-            // LCOV_EXCL_START
-            printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", point_index, (CeedScalar)read_array[point_index]);
-            // LCOV_EXCL_STOP
-          }
-          point_index = (point_index + 1) % num_points;
+      for (CeedInt j = 0; j < num_points_in_elem; j++) {
+        if (i != read_array[j]) {
+          // LCOV_EXCL_START
+          printf("Error in restricted element array %" CeedInt_FMT " y[%" CeedInt_FMT "] = %f\n", i, j, (CeedScalar)read_array[j]);
+          // LCOV_EXCL_STOP
         }
       }
-      CeedVectorRestoreArrayRead(x, &read_array);
+      CeedVectorRestoreArrayRead(y, &read_array);
     }
   }
 
diff --git a/tests/t234-elemrestriction.c b/tests/t234-elemrestriction.c
new file mode 100644
index 0000000000..f42c2aed9f
--- /dev/null
+++ b/tests/t234-elemrestriction.c
@@ -0,0 +1,75 @@
+/// @file
+/// Test creation, transpose use, and destruction of an element restriction at points for single elements
+/// \test Test creation, transpose use, and destruction of an element restriction at points for single elements
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed                ceed;
+  CeedInt             num_elem = 3, num_points = num_elem * 2;
+  CeedInt             ind[(num_elem + 1) + num_points];
+  CeedVector          x, y;
+  CeedElemRestriction elem_restriction;
+
+  CeedInit(argv[1], &ceed);
+
+  {
+    CeedInt offset      = num_elem + 1;
+    CeedInt point_index = num_elem;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
+
+      ind[i] = offset;
+      for (CeedInt j = 0; j < num_points_in_elem; j++) {
+        ind[offset + j] = point_index;
+        point_index     = (point_index + 1) % num_points;
+      }
+      offset += num_points_in_elem;
+    }
+    ind[num_elem] = offset;
+  }
+  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind, &elem_restriction);
+
+  CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
+  CeedVectorSetValue(x, 0.0);
+  {
+    CeedInt max_points;
+
+    CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
+    CeedVectorCreate(ceed, max_points, &y);
+    CeedVectorSetValue(y, 1.0);
+  }
+
+  {
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt           point_index = num_elem;
+      const CeedScalar *read_array;
+
+      CeedVectorSetValue(x, 0.0);
+      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE);
+
+      CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
+      for (CeedInt j = 0; j < num_elem; j++) {
+        CeedInt num_points_in_elem = (j + 1) % num_elem + 1;
+
+        for (CeedInt k = 0; k < num_points_in_elem; k++) {
+          if (fabs(read_array[point_index] - (i == j ? 1.0 : 0.0)) > 10 * CEED_EPSILON) {
+            // LCOV_EXCL_START
+            printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", point_index, (CeedScalar)read_array[point_index]);
+            // LCOV_EXCL_STOP
+          }
+          point_index = (point_index + 1) % num_points;
+        }
+      }
+      CeedVectorRestoreArrayRead(x, &read_array);
+    }
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&y);
+  CeedElemRestrictionDestroy(&elem_restriction);
+  CeedDestroy(&ceed);
+  return 0;
+}

From d83cf49fece5d7d5441d5b92eb712b904329a4d2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 17 May 2024 11:47:41 -0600
Subject: [PATCH 041/571] ci - use Ubuntu 2404

---
 .github/workflows/c-fortran-test-hardware.yml  | 2 +-
 .github/workflows/c-fortran-test-icc.yml       | 2 +-
 .github/workflows/c-fortran-test-linux-osx.yml | 2 +-
 .github/workflows/c-fortran-test-style.yml     | 2 +-
 .github/workflows/julia-documentation.yml      | 2 +-
 .github/workflows/julia-test-with-style.yml    | 2 +-
 .github/workflows/python-test-with-style.yml   | 2 +-
 .github/workflows/release-notes.yml            | 2 +-
 .github/workflows/rust-documentation.yml       | 2 +-
 .github/workflows/rust-test-with-style.yml     | 4 ++--
 10 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/c-fortran-test-hardware.yml b/.github/workflows/c-fortran-test-hardware.yml
index 7dd7626ebf..80d395f8d0 100644
--- a/.github/workflows/c-fortran-test-hardware.yml
+++ b/.github/workflows/c-fortran-test-hardware.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [gcc-13]
         arch: [aarch64, ppc64le]
         distro: [ubuntu22.04]
diff --git a/.github/workflows/c-fortran-test-icc.yml b/.github/workflows/c-fortran-test-icc.yml
index fc5f3407cd..f7101d4e90 100644
--- a/.github/workflows/c-fortran-test-icc.yml
+++ b/.github/workflows/c-fortran-test-icc.yml
@@ -14,7 +14,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
 
     runs-on: ${{ matrix.os }}
 
diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index 806cbcc16d..3b369ce1e4 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04, macos-13]
+        os: [ubuntu-24.04, macos-13]
         compiler: [gcc-13, clang]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml
index 4f2fcb4054..a6daecc241 100644
--- a/.github/workflows/c-fortran-test-style.yml
+++ b/.github/workflows/c-fortran-test-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/julia-documentation.yml b/.github/workflows/julia-documentation.yml
index d7a432426f..b90bb1bb1e 100644
--- a/.github/workflows/julia-documentation.yml
+++ b/.github/workflows/julia-documentation.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@latest
diff --git a/.github/workflows/julia-test-with-style.yml b/.github/workflows/julia-test-with-style.yml
index b74434ff49..a292c9550b 100644
--- a/.github/workflows/julia-test-with-style.yml
+++ b/.github/workflows/julia-test-with-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         julia-version: ['1']
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/python-test-with-style.yml b/.github/workflows/python-test-with-style.yml
index a8fc0af33c..57112d1e3f 100644
--- a/.github/workflows/python-test-with-style.yml
+++ b/.github/workflows/python-test-with-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [gcc-13]
         python-version: ['3.x']
 
diff --git a/.github/workflows/release-notes.yml b/.github/workflows/release-notes.yml
index 8d90b2490d..a4fa213618 100644
--- a/.github/workflows/release-notes.yml
+++ b/.github/workflows/release-notes.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
 
     runs-on: ${{ matrix.os }}
 
diff --git a/.github/workflows/rust-documentation.yml b/.github/workflows/rust-documentation.yml
index b0ca00c440..4d6410548a 100644
--- a/.github/workflows/rust-documentation.yml
+++ b/.github/workflows/rust-documentation.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/rust-test-with-style.yml b/.github/workflows/rust-test-with-style.yml
index 0626ecb989..d6e1f2a42b 100644
--- a/.github/workflows/rust-test-with-style.yml
+++ b/.github/workflows/rust-test-with-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}
@@ -42,7 +42,7 @@ jobs:
   style:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}

From 29ec485eb72075150292bd9d6291eab6d473a1fc Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Wed, 22 May 2024 15:00:00 -0600
Subject: [PATCH 042/571] backends/cuda: NVRTC compile to CUBIN when supported
 (resolve #1587)

This allows using a newer CUDA runtime with an older driver, and seems
to have no downsides.

  NVRTC can generate cubins directly starting with CUDA 11.1. [...]
  NVRTC used to support only virtual architectures through the option
  -arch, since it was only emitting PTX. It will now support actual
  architectures as well to emit SASS. The interface is augmented to
  retrieve either the PTX or cubin if an actual architecture is
  specified.

https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
---
 backends/cuda/ceed-cuda-compile.cpp | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index b8186ba2d1..9bd433fd01 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -80,9 +80,19 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   opts[0] = "-default-device";
   CeedCallBackend(CeedGetData(ceed, &ceed_data));
   CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
-  std::string arch_arg = "-arch=compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
-  opts[1]              = arch_arg.c_str();
-  opts[2]              = "-Dint32_t=int";
+  std::string arch_arg =
+#if CUDA_VERSION >= 11010
+      // NVRTC used to support only virtual architectures through the option
+      // -arch, since it was only emitting PTX. It will now support actual
+      // architectures as well to emit SASS.
+      // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
+      "-arch=sm_"
+#else
+      "-arch=compute_"
+#endif
+      + std::to_string(prop.major) + std::to_string(prop.minor);
+  opts[1] = arch_arg.c_str();
+  opts[2] = "-Dint32_t=int";
 
   // Add string source argument provided in call
   code << source;
@@ -106,9 +116,15 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
     return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
   }
 
+#if CUDA_VERSION >= 11010
+  CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
+  CeedCallBackend(CeedMalloc(ptx_size, &ptx));
+  CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
+#else
   CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
   CeedCallBackend(CeedMalloc(ptx_size, &ptx));
   CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
+#endif
   CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
 
   CeedCallCuda(ceed, cuModuleLoadData(module, ptx));

From 68ebe796ba3a20e3a46fc8d1f708ffe4751e188e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 23 May 2024 01:26:10 -0600
Subject: [PATCH 043/571] test - update t531 for corectness, robustness

---
 tests/t531-operator-f.f90 | 47 +++++++++++++++++++--------------------
 tests/t531-operator-f.h   |  4 ++--
 tests/t531-operator.c     | 41 ++++++++++++++++++----------------
 tests/t531-operator.h     |  5 ++---
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/tests/t531-operator-f.f90 b/tests/t531-operator-f.f90
index d78dd92351..a2deb4434d 100644
--- a/tests/t531-operator-f.f90
+++ b/tests/t531-operator-f.f90
@@ -15,7 +15,7 @@ program test
       integer bx,bu
       integer qf_setup,qf_diff,qf_diff_lin
       integer op_setup,op_diff,op_diff_lin
-      integer qdata,x,a,u,v
+      integer qdata,x,a,u,v,v_lin
       integer nelem,p,q,d
       integer row,col,offset
       parameter(nelem=6)
@@ -28,8 +28,8 @@ program test
       parameter(ndofs=(nx*2+1)*(ny*2+1))
       parameter(nqpts=nelem*q*q)
       integer indx(nelem*p*p)
-      real*8 arrx(d*ndofs),vv(ndofs)
-      integer*8 xoffset,voffset
+      real*8 arrx(d*ndofs),uu(ndofs),vv(ndofs),vvlin(ndofs)
+      integer*8 xoffset,uoffset,voffset,vlinoffset
 
       character arg*32
 
@@ -42,14 +42,26 @@ program test
 ! DoF Coordinates
       do i=0,nx*2
         do j=0,ny*2
-          arrx(i+j*(nx*2+1)+0*ndofs+1)=1.d0*i/(2*nx)
-          arrx(i+j*(nx*2+1)+1*ndofs+1)=1.d0*j/(2*ny)
+          arrx(i+j*(nx*2+1)+0*ndofs+1)=1.d0*i/(2*nx)+j*0.5
+          arrx(i+j*(nx*2+1)+1*ndofs+1)=1.d0*j/(2*ny)+i*0.5
         enddo
       enddo
       call ceedvectorcreate(ceed,d*ndofs,x,err)
       xoffset=0
       call ceedvectorsetarray(x,ceed_mem_host,ceed_use_pointer,arrx,xoffset,err)
 
+! Input, output arrays
+      do i=0,nx*2
+        do j=0,ny*2
+          uu(i+j*(nx*2+1)+1)=i*nx+j*ny
+        enddo
+      enddo
+      call ceedvectorcreate(ceed,ndofs,u,err)
+      uoffset=0
+      call ceedvectorsetarray(u,ceed_mem_host,ceed_use_pointer,uu,uoffset,err)
+      call ceedvectorcreate(ceed,ndofs,v,err)
+      call ceedvectorcreate(ceed,ndofs,v_lin,err)
+
 ! Qdata Vector
       call ceedvectorcreate(ceed,nqpts*d*(d+1)/2,qdata,err)
 
@@ -125,23 +137,8 @@ program test
      & bu,ceed_vector_active,err)
 
 ! Apply original Poisson Operator
-      call ceedvectorcreate(ceed,ndofs,u,err)
-      call ceedvectorsetvalue(u,1.d0,err)
-      call ceedvectorcreate(ceed,ndofs,v,err)
-      call ceedvectorsetvalue(v,0.d0,err)
       call ceedoperatorapply(op_diff,u,v,ceed_request_immediate,err)
 
-! Check Output
-      call ceedvectorgetarrayread(v,ceed_mem_host,vv,voffset,err)
-      do i=1,ndofs
-      if (abs(vv(voffset+i))>1.0d-14) then
-! LCOV_EXCL_START
-        write(*,*) 'Error: Operator computed v[i] = ',vv(voffset+i),' != 0.0'
-! LCOV_EXCL_STOP
-      endif
-      enddo
-      call ceedvectorrestorearrayread(v,vv,voffset,err)
-
 ! Assemble QFunction
       call ceedoperatorlinearassembleqfunction(op_diff,a,erestrictlini,&
      & ceed_request_immediate,err)
@@ -165,20 +162,21 @@ program test
      & bu,ceed_vector_active,err)
 
 ! Apply linearized Poisson Operator
-      call ceedvectorsetvalue(v,0.d0,err)
-      call ceedoperatorapply(op_diff_lin,u,v,ceed_request_immediate,err)
+      call ceedoperatorapply(op_diff_lin,u,v_lin,ceed_request_immediate,err)
 
 ! Check Output
       call ceedvectorgetarrayread(v,ceed_mem_host,vv,voffset,err)
+      call ceedvectorgetarrayread(v_lin,ceed_mem_host,vvlin,vlinoffset,err)
       do i=1,ndofs
-      if (abs(vv(voffset+i))>1.0d-14) then
+      if (abs(vv(voffset+i)-vvlin(vlinoffset+i))>1.0d-14) then
 ! LCOV_EXCL_START
         write(*,*) 'Error: Linearized operator computed v[i] = ',vv(voffset+i),&
-     &   ' != 0.0'
+     &   ' != ',vvlin(vlinoffset+i)
 ! LCOV_EXCL_STOP
       endif
       enddo
       call ceedvectorrestorearrayread(v,vv,voffset,err)
+      call ceedvectorrestorearrayread(v_lin,vvlin,vlinoffset,err)
 
 ! Cleanup
       call ceedqfunctiondestroy(qf_setup,err)
@@ -198,6 +196,7 @@ program test
       call ceedvectordestroy(a,err)
       call ceedvectordestroy(u,err)
       call ceedvectordestroy(v,err)
+      call ceedvectordestroy(v_lin,err)
       call ceedvectordestroy(qdata,err)
       call ceeddestroy(ceed,err)
       end
diff --git a/tests/t531-operator-f.h b/tests/t531-operator-f.h
index 20f02ea332..590140632d 100644
--- a/tests/t531-operator-f.h
+++ b/tests/t531-operator-f.h
@@ -11,8 +11,8 @@
       do i=1,q
         w=u2(i)/(u1(i+q*0)*u1(i+q*3)-u1(i+q*1)*u1(i+q*2))
         v1(i+q*0)=w*(u1(i+q*2)*u1(i+q*2)+u1(i+q*3)*u1(i+q*3))
-        v1(i+q*1)=-w*(u1(i+q*0)*u1(i+q*2)+u1(i+q*2)*u1(i+q*3))
-        v1(i+q*2)=w*(u1(i+q*0)*u1(i+q*0)+u1(i+q*1)*u1(i+q*1))
+        v1(i+q*1)=w*(u1(i+q*0)*u1(i+q*0)+u1(i+q*1)*u1(i+q*1))
+        v1(i+q*2)=-w*(u1(i+q*0)*u1(i+q*2)+u1(i+q*2)*u1(i+q*3))
       enddo
 
       ierr=0
diff --git a/tests/t531-operator.c b/tests/t531-operator.c
index 9462d49323..39168ecba6 100644
--- a/tests/t531-operator.c
+++ b/tests/t531-operator.c
@@ -14,7 +14,7 @@ int main(int argc, char **argv) {
   CeedBasis           basis_x, basis_u;
   CeedQFunction       qf_setup, qf_diff, qf_diff_assembled;
   CeedOperator        op_setup, op_diff, op_diff_assembled;
-  CeedVector          q_data, x, assembled = NULL, u, v;
+  CeedVector          q_data, x, assembled = NULL, u, v, v_assembled;
   CeedInt             num_elem = 6, p = 3, q = 4, dim = 2;
   CeedInt             nx = 3, ny = 2;
   CeedInt             num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * q * q;
@@ -29,14 +29,26 @@ int main(int argc, char **argv) {
 
     for (CeedInt i = 0; i < nx * 2 + 1; i++) {
       for (CeedInt j = 0; j < ny * 2 + 1; j++) {
-        x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx);
-        x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny);
+        x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx) + 0.5 * j;
+        x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny) + 0.5 * i;
       }
     }
     CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
   CeedVectorCreate(ceed, num_dofs, &u);
+  {
+    CeedScalar *u_array;
+
+    CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array);
+    for (CeedInt i = 0; i < nx * 2 + 1; i++) {
+      for (CeedInt j = 0; j < ny * 2 + 1; j++) {
+        u_array[i + j * (nx * 2 + 1)] = i * nx + j * ny;
+      }
+    }
+    CeedVectorRestoreArray(u, &u_array);
+  }
   CeedVectorCreate(ceed, num_dofs, &v);
+  CeedVectorCreate(ceed, num_dofs, &v_assembled);
   CeedVectorCreate(ceed, num_qpts * dim * (dim + 1) / 2, &q_data);
 
   // Restrictions
@@ -88,20 +100,8 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Apply original Poisson Operator
-  CeedVectorSetValue(u, 1.0);
   CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE);
 
-  // Check output
-  {
-    const CeedScalar *v_array;
-
-    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
-    for (CeedInt i = 0; i < num_dofs; i++) {
-      if (fabs(v_array[i]) > 100. * CEED_EPSILON) printf("Error: Operator computed v[%" CeedInt_FMT "] = %f != 0.0\n", i, v_array[i]);
-    }
-    CeedVectorRestoreArrayRead(v, &v_array);
-  }
-
   // Assemble QFunction
   CeedOperatorSetQFunctionAssemblyReuse(op_diff, true);
   CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_diff, &assembled, &elem_restriction_assembled, CEED_REQUEST_IMMEDIATE);
@@ -122,18 +122,20 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff_assembled, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Apply new Poisson Operator
-  CeedVectorSetValue(v, 0.0);
-  CeedOperatorApply(op_diff_assembled, u, v, CEED_REQUEST_IMMEDIATE);
+  CeedOperatorApply(op_diff_assembled, u, v_assembled, CEED_REQUEST_IMMEDIATE);
 
   // Check output
   {
-    const CeedScalar *v_array;
+    const CeedScalar *v_array, *v_assembled_array;
 
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    CeedVectorGetArrayRead(v_assembled, CEED_MEM_HOST, &v_assembled_array);
     for (CeedInt i = 0; i < num_dofs; i++) {
-      if (fabs(v_array[i]) > 100. * CEED_EPSILON) printf("Error: Linearized operator computed v[i] = %f != 0.0\n", v_array[i]);
+      if (fabs(v_array[i] - v_assembled_array[i]) > 100. * CEED_EPSILON)
+        printf("Error: Linearized operator computed v[i] = %f != %f\n", v_assembled_array[i], v_array[i]);
     }
     CeedVectorRestoreArrayRead(v, &v_array);
+    CeedVectorRestoreArrayRead(v_assembled, &v_assembled_array);
   }
 
   // Cleanup
@@ -142,6 +144,7 @@ int main(int argc, char **argv) {
   CeedVectorDestroy(&q_data);
   CeedVectorDestroy(&u);
   CeedVectorDestroy(&v);
+  CeedVectorDestroy(&v_assembled);
   CeedElemRestrictionDestroy(&elem_restriction_u);
   CeedElemRestrictionDestroy(&elem_restriction_x);
   CeedElemRestrictionDestroy(&elem_restriction_q_data);
diff --git a/tests/t531-operator.h b/tests/t531-operator.h
index a9f69f6bd5..4050ca35dc 100644
--- a/tests/t531-operator.h
+++ b/tests/t531-operator.h
@@ -28,8 +28,8 @@ CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, C
     const CeedScalar J22 = J[i + Q * 3];
     const CeedScalar w   = qw[i] / (J11 * J22 - J21 * J12);
     qd[i + Q * 0]        = w * (J12 * J12 + J22 * J22);
-    qd[i + Q * 2]        = w * (J11 * J11 + J21 * J21);
-    qd[i + Q * 1]        = -w * (J11 * J12 + J21 * J22);
+    qd[i + Q * 1]        = w * (J11 * J11 + J21 * J21);
+    qd[i + Q * 2]        = -w * (J11 * J12 + J21 * J22);
   }
 
   return 0;
@@ -50,7 +50,6 @@ CEED_QFUNCTION(diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, Ce
     dv[i + Q * 0]        = qd[i + Q * 0] * du0 + qd[i + Q * 2] * du1;
     dv[i + Q * 1]        = qd[i + Q * 2] * du0 + qd[i + Q * 1] * du1;
   }
-
   return 0;
 }
 

From c7b67790f45ae71043f73ea7aa7684f20189bfc2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 23 May 2024 15:35:03 -0600
Subject: [PATCH 044/571] cpu - clean up QF assembly memory access

---
 backends/blocked/ceed-blocked-operator.c | 125 ++++++------
 backends/blocked/ceed-blocked.h          |   1 -
 backends/opt/ceed-opt-operator.c         | 135 +++++++------
 backends/opt/ceed-opt.h                  |   1 -
 backends/ref/ceed-ref-operator.c         | 245 ++++++++++++-----------
 backends/ref/ceed-ref.h                  |  12 +-
 6 files changed, 278 insertions(+), 241 deletions(-)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index d1f2678e1d..fb7425d233 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -437,8 +437,7 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
 static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator op, bool build_objects, CeedVector *assembled,
                                                                   CeedElemRestriction *rstr, CeedRequest *request) {
   Ceed                  ceed;
-  CeedSize              q_size;
-  CeedInt               Q, num_input_fields, num_output_fields, num_elem, size;
+  CeedInt               num_active_in, num_active_out, Q, num_input_fields, num_output_fields, num_elem;
   const CeedInt         block_size = 8;
   CeedScalar           *l_vec_array;
   CeedScalar           *e_data_full[2 * CEED_FIELD_MAX] = {0};
@@ -448,8 +447,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedOperator_Blocked *impl;
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedInt             num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
-  CeedVector         *active_in  = impl->qf_active_in;
+  num_active_in                  = impl->num_active_in;
+  num_active_out                 = impl->num_active_in;
   CeedVector          l_vec      = impl->qf_l_vec;
   CeedElemRestriction block_rstr = impl->qf_block_rstr;
 
@@ -471,52 +470,42 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (num_active_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
-        for (CeedInt field = 0; field < size; field++) {
-          q_size = (CeedSize)Q * block_size;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size]));
-        }
-        num_active_in += size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
+        num_active_in += field_size;
       }
     }
+    CeedCheck(num_active_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (num_active_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedInt    field_size;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        num_active_out += size;
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+        num_active_out += field_size;
       }
     }
+    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_out = num_active_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Setup Lvec
   if (!l_vec) {
     const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * num_active_in * num_active_out;
@@ -553,37 +542,61 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
     CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, true, e_data_full, impl));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
-      // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
-            l_vec_array += size * Q * block_size;  // Advance the pointer by the size of the output
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedInt    field_size;
+      CeedVector vec;
+
+      // Get input vector
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      // Check if active input
+      if (vec != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedInt    field_size;
+            CeedVector vec;
+
+            // Get output vector
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            // Check if active output
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              l_vec_array += field_size * Q * block_size;  // Advance the pointer by the size of the output
+            }
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          CeedInt           field_size;
+          const CeedScalar *array;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * Q * block_size; j++) l_vec_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          l_vec_array += field_size * Q * block_size;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < size * Q * block_size; i++) l_vec_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        l_vec_array += size * Q * block_size;
       }
     }
   }
@@ -657,10 +670,6 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
 
   // QFunction assembly data
-  for (CeedInt i = 0; i < impl->num_active_in; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->qf_active_in));
   CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec));
   CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr));
 
diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h
index 917f4eb604..4ddb3ab67e 100644
--- a/backends/blocked/ceed-blocked.h
+++ b/backends/blocked/ceed-blocked.h
@@ -26,7 +26,6 @@ typedef struct {
   CeedVector          *q_vecs_out;   /* Element block output Q-vectors */
   CeedInt              num_inputs, num_outputs;
   CeedInt              num_active_in, num_active_out;
-  CeedVector          *qf_active_in;
   CeedVector           qf_l_vec;
   CeedElemRestriction  qf_block_rstr;
 } CeedOperator_Blocked;
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 92f1e7ad07..3770164880 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -441,8 +441,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
                                                               CeedRequest *request) {
   Ceed                ceed;
   Ceed_Opt           *ceed_impl;
-  CeedSize            q_size;
-  CeedInt             Q, num_input_fields, num_output_fields, num_elem, size;
+  CeedInt             num_active_in, num_active_out, Q, num_input_fields, num_output_fields, num_elem;
   CeedScalar         *l_vec_array, *e_data[2 * CEED_FIELD_MAX] = {0};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -452,16 +451,17 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  num_active_in  = impl->num_active_in;
+  num_active_out = impl->num_active_out;
+
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  const CeedInt       block_size    = ceed_impl->block_size;
-  const CeedInt       num_blocks    = (num_elem / block_size) + !!(num_elem % block_size);
-  CeedInt             num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
+  const CeedInt       block_size = ceed_impl->block_size;
+  const CeedInt       num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
   CeedVector          l_vec      = impl->qf_l_vec;
-  CeedVector         *active_in  = impl->qf_active_in;
   CeedElemRestriction block_rstr = impl->qf_block_rstr;
 
   // Setup
@@ -474,52 +474,42 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, NULL, e_data, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (num_active_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
-        for (CeedInt field = 0; field < size; field++) {
-          q_size = (CeedSize)Q * block_size;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size]));
-        }
-        num_active_in += size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
+        num_active_in += field_size;
       }
     }
+    CeedCheck(num_active_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (num_active_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedInt    field_size;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        num_active_out += size;
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+        num_active_out += field_size;
       }
     }
+    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_out = num_active_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Setup l_vec
   if (!l_vec) {
     const CeedSize l_size = (CeedSize)block_size * Q * num_active_in * num_active_out;
@@ -560,41 +550,66 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
         CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, NULL, true, e_data, impl, request));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
-      // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
-            l_vec_array += size * Q * block_size;  // Advance the pointer by the size of the output
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedInt    field_size;
+      CeedVector vec;
+
+      // Get input vector
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      // Check if active input
+      if (vec != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedVector vec;
+
+            // Get output vector
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            // Check if active output
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedInt field_size;
+
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              l_vec_array += field_size * Q * block_size;  // Advance the pointer by the size of the output
+            }
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          CeedInt           field_size;
+          const CeedScalar *array;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * Q * block_size; j++) l_vec_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          l_vec_array += field_size * Q * block_size;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < size * Q * block_size; i++) l_vec_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        l_vec_array += size * Q * block_size;
       }
     }
 
-    // Assemble QFunction
+    // Un-set output Qvecs to prevent accidental overwrite of Assembled
     if (!impl->is_identity_qf) {
       for (CeedInt out = 0; out < num_output_fields; out++) {
         CeedVector vec;
@@ -613,7 +628,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
     CeedCallBackend(CeedElemRestrictionApplyBlock(block_rstr, e / block_size, CEED_TRANSPOSE, l_vec, *assembled, request));
   }
 
-  // Un-set output Qvecs to prevent accidental overwrite of Assembled
+  // Reset output Qvecs
   for (CeedInt out = 0; out < num_output_fields; out++) {
     CeedVector vec;
 
@@ -672,10 +687,6 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
 
   // QFunction assembly data
-  for (CeedInt i = 0; i < impl->num_active_in; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->qf_active_in));
   CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec));
   CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr));
 
diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h
index 9e12e612bf..1ccec8ac75 100644
--- a/backends/opt/ceed-opt.h
+++ b/backends/opt/ceed-opt.h
@@ -30,7 +30,6 @@ typedef struct {
   CeedVector          *q_vecs_out;   /* Element block output Q-vectors */
   CeedInt              num_inputs, num_outputs;
   CeedInt              num_active_in, num_active_out;
-  CeedVector          *qf_active_in;
   CeedVector           qf_l_vec;
   CeedElemRestriction  qf_block_rstr;
 } CeedOperator_Opt;
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 91ccf97b07..3d9f413d6a 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -382,10 +382,8 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
 static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
                                                               CeedRequest *request) {
   Ceed                ceed, ceed_parent;
-  CeedSize            q_size;
-  CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
+  CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL};
-  CeedVector         *active_in;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -394,8 +392,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  active_in     = impl->qf_active_in;
-  num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
+  num_active_in  = impl->num_active_in;
+  num_active_out = impl->num_active_out;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -412,51 +410,42 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (num_active_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
-        for (CeedInt field = 0; field < size; field++) {
-          q_size = (CeedSize)Q;
-          CeedCallBackend(CeedVectorCreate(ceed_parent, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q]));
-        }
-        num_active_in += size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
+        num_active_in += field_size;
       }
     }
+    CeedCheck(num_active_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (num_active_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedInt    field_size;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        num_active_out += size;
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+        num_active_out += field_size;
       }
     }
+    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_out = num_active_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Build objects if needed
   if (build_objects) {
     const CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
@@ -478,37 +467,62 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
     CeedCallBackend(CeedOperatorInputBasis_Ref(e, Q, qf_input_fields, op_input_fields, num_input_fields, true, e_data_full, impl));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedInt    field_size;
+      CeedVector vec;
+
       // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
-            assembled_array += size * Q;  // Advance the pointer by the size of the output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      if (vec != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q; j++) array[field * Q + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedVector vec;
+
+            // Get output vector
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            // Check if active output
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedInt field_size;
+
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              assembled_array += field_size * Q;  // Advance the pointer by the size of the output
+            }
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          CeedInt           field_size;
+          const CeedScalar *array;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * Q; j++) assembled_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          assembled_array += field_size * Q;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q; j++) array[field * Q + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < size * Q; i++) assembled_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        assembled_array += size * Q;
       }
     }
   }
@@ -879,10 +893,9 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
 static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled,
                                                                       CeedElemRestriction *rstr, CeedRequest *request) {
   Ceed                ceed;
-  CeedSize            q_size;
   CeedInt             num_active_in, num_active_out, max_num_points, num_elem, num_input_fields, num_output_fields, num_points_offset = 0;
   CeedScalar         *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL};
-  CeedVector         *active_in, point_coords                           = NULL;
+  CeedVector          point_coords = NULL;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -891,8 +904,8 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  active_in     = impl->qf_active_in;
-  num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
+  num_active_in  = impl->num_active_in;
+  num_active_out = impl->num_active_out;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
@@ -912,11 +925,10 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (num_active_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedInt     field_size;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
@@ -933,27 +945,18 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
         }
         // Get size of active input
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
-        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + field_size, &active_in));
-        for (CeedInt field = 0; field < field_size; field++) {
-          q_size = (CeedSize)max_num_points;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * q_size]));
-        }
         num_active_in += field_size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
       }
     }
+    CeedCheck(num_active_in, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (num_active_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedVector vec;
       CeedInt    field_size;
+      CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
@@ -970,15 +973,14 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
         }
         // Get size of active output
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         num_active_out += field_size;
       }
     }
+    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->num_active_out = num_active_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Build objects if needed
   if (build_objects) {
     CeedInt        num_points_total;
@@ -1013,39 +1015,62 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
                                                        impl->point_coords_elem, true, e_data_full, impl, request));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
-      // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-          CeedInt    field_size;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
-            assembled_array += field_size * num_points;  // Advance the pointer by the size of the output
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedInt    field_size;
+      CeedVector vec;
+
+      // Get input vector
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      // Check if active input
+      if (vec != CEED_VECTOR_ACTIVE) continue;
+      // Get size of active input
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < num_points; j++) array[field * num_points + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedVector vec;
+            CeedInt    field_size;
+
+            // Get output vector
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            // Check if active output
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              assembled_array += field_size * num_points;  // Advance the pointer by the size of the output
+            }
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          const CeedScalar *array;
+          CeedInt           field_size;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * num_points; j++) assembled_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          assembled_array += field_size * num_points;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < num_points; j++) array[field * num_points + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-        CeedInt           field_size;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < field_size * num_points; i++) assembled_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        assembled_array += field_size * num_points;
       }
     }
     num_points_offset += num_points;
@@ -1328,12 +1353,6 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
   CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
-  // QFunction assembly
-  for (CeedInt i = 0; i < impl->num_active_in; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->qf_active_in));
-
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index 8eb3b54331..9fd3fa7b31 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -11,11 +11,6 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-typedef struct {
-  CeedScalar *collo_grad_1d;
-  bool        has_collo_interp;
-} CeedBasis_Ref;
-
 typedef struct {
   CeedScalar *array;
   CeedScalar *array_borrowed;
@@ -36,6 +31,11 @@ typedef struct {
                CeedRequest *);
 } CeedElemRestriction_Ref;
 
+typedef struct {
+  CeedScalar *collo_grad_1d;
+  bool        has_collo_interp;
+} CeedBasis_Ref;
+
 typedef struct {
   const CeedScalar **inputs;
   CeedScalar       **outputs;
@@ -57,7 +57,7 @@ typedef struct {
   CeedVector *q_vecs_out;   /* Single element output Q-vectors */
   CeedInt     num_inputs, num_outputs;
   CeedInt     num_active_in, num_active_out;
-  CeedVector *qf_active_in, point_coords_elem;
+  CeedVector  point_coords_elem;
 } CeedOperator_Ref;
 
 CEED_INTERN int CeedVectorCreate_Ref(CeedSize n, CeedVector vec);

From 1a3e18b33c0d9ffa0a0a078143c6ea92713d0b7d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 23 May 2024 08:29:29 -0600
Subject: [PATCH 045/571] memcheck - vec writable buffer

---
 backends/memcheck/ceed-memcheck-vector.c | 11 +++++++++--
 backends/memcheck/ceed-memcheck.h        |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index b12b7ead95..ae29245120 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -102,13 +102,17 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce
 // Vector Get Array
 //------------------------------------------------------------------------------
 static int CeedVectorGetArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
+  CeedSize             length;
   CeedVector_Memcheck *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
 
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
 
-  *array = impl->array;
+  CeedCallBackend(CeedCalloc(length, &impl->array_writable_copy));
+  memcpy(impl->array_writable_copy, impl->array, length * sizeof((impl->array)[0]));
+  *array = impl->array_writable_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -122,9 +126,10 @@ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type,
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, (CeedScalar **)array));
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
 
   // Make copy to verify no write occurred
+  *array = impl->array;
   if (!impl->array_read_only_copy) {
     CeedCallBackend(CeedCalloc(length, &impl->array_read_only_copy));
     memcpy(impl->array_read_only_copy, *array, length * sizeof((*array)[0]));
@@ -162,6 +167,8 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
 
+  memcpy(impl->array, impl->array_writable_copy, length * sizeof((impl->array)[0]));
+  CeedCallBackend(CeedFree(&impl->array_writable_copy));
   if (impl->is_write_only_access) {
     for (CeedSize i = 0; i < length; i++) {
       if (isnan(impl->array[i]))
diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h
index 603597fb1e..f4787cf1ce 100644
--- a/backends/memcheck/ceed-memcheck.h
+++ b/backends/memcheck/ceed-memcheck.h
@@ -17,6 +17,7 @@ typedef struct {
   CeedScalar *array_owned;
   CeedScalar *array_borrowed;
   CeedScalar *array_read_only_copy;
+  CeedScalar *array_writable_copy;
 } CeedVector_Memcheck;
 
 typedef struct {

From ff8551c5d4a7b43daf0eaa912a82cb984d758edd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 28 May 2024 09:20:59 -0600
Subject: [PATCH 046/571] style - num_active_* => qf_size_*

---
 backends/blocked/ceed-blocked-operator.c | 38 ++++++++--------
 backends/blocked/ceed-blocked.h          |  2 +-
 backends/opt/ceed-opt-operator.c         | 38 ++++++++--------
 backends/opt/ceed-opt.h                  |  2 +-
 backends/ref/ceed-ref-operator.c         | 57 ++++++++++++------------
 backends/ref/ceed-ref.h                  |  2 +-
 6 files changed, 69 insertions(+), 70 deletions(-)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index fb7425d233..10a72510f3 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -437,7 +437,7 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
 static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator op, bool build_objects, CeedVector *assembled,
                                                                   CeedElemRestriction *rstr, CeedRequest *request) {
   Ceed                  ceed;
-  CeedInt               num_active_in, num_active_out, Q, num_input_fields, num_output_fields, num_elem;
+  CeedInt               qf_size_in, qf_size_out, Q, num_input_fields, num_output_fields, num_elem;
   const CeedInt         block_size = 8;
   CeedScalar           *l_vec_array;
   CeedScalar           *e_data_full[2 * CEED_FIELD_MAX] = {0};
@@ -447,8 +447,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedOperator_Blocked *impl;
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  num_active_in                  = impl->num_active_in;
-  num_active_out                 = impl->num_active_in;
+  qf_size_in                     = impl->qf_size_in;
+  qf_size_out                    = impl->qf_size_out;
   CeedVector          l_vec      = impl->qf_l_vec;
   CeedElemRestriction block_rstr = impl->qf_block_rstr;
 
@@ -470,7 +470,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (num_active_in == 0) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -481,15 +481,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        num_active_in += field_size;
+        qf_size_in += field_size;
       }
     }
-    CeedCheck(num_active_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_in = num_active_in;
+    CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (num_active_out == 0) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -499,16 +499,16 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
       // Check if active output
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
-        num_active_out += field_size;
+        qf_size_out += field_size;
       }
     }
-    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
   // Setup Lvec
   if (!l_vec) {
-    const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * num_active_in * num_active_out;
+    const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * qf_size_in * qf_size_out;
 
     CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec));
     impl->qf_l_vec = l_vec;
@@ -517,21 +517,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
 
   // Setup block restriction
   if (!block_rstr) {
-    const CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    const CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
-    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out,
-                                                            num_active_in * num_active_out * num_elem * Q, strides, &block_rstr));
+    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, qf_size_in * qf_size_out,
+                                                            qf_size_in * qf_size_out * num_elem * Q, strides, &block_rstr));
     impl->qf_block_rstr = block_rstr;
   }
 
   // Build objects if needed
   if (build_objects) {
-    const CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
-    const CeedInt  strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+    const CeedInt  strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q,
-                                                     strides, rstr));
+    CeedCallBackend(
+        CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, qf_size_in * qf_size_out * num_elem * Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
   }
diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h
index 4ddb3ab67e..fef7967518 100644
--- a/backends/blocked/ceed-blocked.h
+++ b/backends/blocked/ceed-blocked.h
@@ -25,7 +25,7 @@ typedef struct {
   CeedVector          *q_vecs_in;    /* Element block input Q-vectors  */
   CeedVector          *q_vecs_out;   /* Element block output Q-vectors */
   CeedInt              num_inputs, num_outputs;
-  CeedInt              num_active_in, num_active_out;
+  CeedInt              qf_size_in, qf_size_out;
   CeedVector           qf_l_vec;
   CeedElemRestriction  qf_block_rstr;
 } CeedOperator_Blocked;
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 3770164880..bc374cd460 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -441,7 +441,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
                                                               CeedRequest *request) {
   Ceed                ceed;
   Ceed_Opt           *ceed_impl;
-  CeedInt             num_active_in, num_active_out, Q, num_input_fields, num_output_fields, num_elem;
+  CeedInt             qf_size_in, qf_size_out, Q, num_input_fields, num_output_fields, num_elem;
   CeedScalar         *l_vec_array, *e_data[2 * CEED_FIELD_MAX] = {0};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -451,8 +451,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  num_active_in  = impl->num_active_in;
-  num_active_out = impl->num_active_out;
+  qf_size_in  = impl->qf_size_in;
+  qf_size_out = impl->qf_size_out;
 
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -474,7 +474,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, NULL, e_data, impl, request));
 
   // Count number of active input fields
-  if (num_active_in == 0) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -485,15 +485,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        num_active_in += field_size;
+        qf_size_in += field_size;
       }
     }
-    CeedCheck(num_active_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_in = num_active_in;
+    CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (num_active_out == 0) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -503,16 +503,16 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
       // Check if active output
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
-        num_active_out += field_size;
+        qf_size_out += field_size;
       }
     }
-    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
   // Setup l_vec
   if (!l_vec) {
-    const CeedSize l_size = (CeedSize)block_size * Q * num_active_in * num_active_out;
+    const CeedSize l_size = (CeedSize)block_size * Q * qf_size_in * qf_size_out;
 
     CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec));
     CeedCallBackend(CeedVectorSetValue(l_vec, 0.0));
@@ -521,21 +521,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
 
   // Output blocked restriction
   if (!block_rstr) {
-    CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
-    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out,
-                                                            num_active_in * num_active_out * num_elem * Q, strides, &block_rstr));
+    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, qf_size_in * qf_size_out,
+                                                            qf_size_in * qf_size_out * num_elem * Q, strides, &block_rstr));
     impl->qf_block_rstr = block_rstr;
   }
 
   // Build objects if needed
   if (build_objects) {
-    const CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
-    CeedInt        strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+    CeedInt        strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q,
-                                                     strides, rstr));
+    CeedCallBackend(
+        CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, qf_size_in * qf_size_out * num_elem * Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
   }
diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h
index 1ccec8ac75..b40124fb99 100644
--- a/backends/opt/ceed-opt.h
+++ b/backends/opt/ceed-opt.h
@@ -29,7 +29,7 @@ typedef struct {
   CeedVector          *q_vecs_in;    /* Element block input Q-vectors  */
   CeedVector          *q_vecs_out;   /* Element block output Q-vectors */
   CeedInt              num_inputs, num_outputs;
-  CeedInt              num_active_in, num_active_out;
+  CeedInt              qf_size_in, qf_size_out;
   CeedVector           qf_l_vec;
   CeedElemRestriction  qf_block_rstr;
 } CeedOperator_Opt;
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 3d9f413d6a..a0a6a29a2d 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -382,7 +382,7 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
 static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
                                                               CeedRequest *request) {
   Ceed                ceed, ceed_parent;
-  CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields;
+  CeedInt             qf_size_in, qf_size_out, Q, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -392,8 +392,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  num_active_in  = impl->num_active_in;
-  num_active_out = impl->num_active_out;
+  qf_size_in  = impl->qf_size_in;
+  qf_size_out = impl->qf_size_out;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -410,7 +410,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (num_active_in == 0) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -421,15 +421,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        num_active_in += field_size;
+        qf_size_in += field_size;
       }
     }
-    CeedCheck(num_active_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_in = num_active_in;
+    CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (num_active_out == 0) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -439,21 +439,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
       // Check if active output
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
-        num_active_out += field_size;
+        qf_size_out += field_size;
       }
     }
-    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
   // Build objects if needed
   if (build_objects) {
-    const CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
-    CeedInt        strides[3] = {1, Q, num_active_in * num_active_out * Q}; /* *NOPAD* */
+    const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+    CeedInt        strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; /* *NOPAD* */
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+    CeedCallBackend(
+        CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, qf_size_in * qf_size_out, qf_size_in * qf_size_out * num_elem * Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
@@ -893,7 +893,7 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
 static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled,
                                                                       CeedElemRestriction *rstr, CeedRequest *request) {
   Ceed                ceed;
-  CeedInt             num_active_in, num_active_out, max_num_points, num_elem, num_input_fields, num_output_fields, num_points_offset = 0;
+  CeedInt             qf_size_in, qf_size_out, max_num_points, num_elem, num_input_fields, num_output_fields, num_points_offset = 0;
   CeedScalar         *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL};
   CeedVector          point_coords = NULL;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -904,8 +904,8 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  num_active_in  = impl->num_active_in;
-  num_active_out = impl->num_active_out;
+  qf_size_in  = impl->qf_size_in;
+  qf_size_out = impl->qf_size_out;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
@@ -925,7 +925,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (num_active_in == 0) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -945,15 +945,15 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
         }
         // Get size of active input
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
-        num_active_in += field_size;
+        qf_size_in += field_size;
       }
     }
-    CeedCheck(num_active_in, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_in = num_active_in;
+    CeedCheck(qf_size_in, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (num_active_out == 0) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
       CeedInt    field_size;
       CeedVector vec;
@@ -974,11 +974,11 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
         // Get size of active output
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        num_active_out += field_size;
+        qf_size_out += field_size;
       }
     }
-    CeedCheck(num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
   // Build objects if needed
@@ -990,9 +990,8 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
     // Create output restriction (at points)
     CeedCallBackend(CeedElemRestrictionGetOffsets(rstr_points, CEED_MEM_HOST, &offsets));
-    CeedCallBackend(CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points_total, num_active_in * num_active_out,
-                                                      num_active_in * num_active_out * num_points_total, CEED_MEM_HOST, CEED_COPY_VALUES, offsets,
-                                                      rstr));
+    CeedCallBackend(CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points_total, qf_size_in * qf_size_out,
+                                                      qf_size_in * qf_size_out * num_points_total, CEED_MEM_HOST, CEED_COPY_VALUES, offsets, rstr));
     CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr_points, &offsets));
 
     // Create assembled vector
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index 9fd3fa7b31..369a27c049 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -56,7 +56,7 @@ typedef struct {
   CeedVector *q_vecs_in;    /* Single element input Q-vectors  */
   CeedVector *q_vecs_out;   /* Single element output Q-vectors */
   CeedInt     num_inputs, num_outputs;
-  CeedInt     num_active_in, num_active_out;
+  CeedInt     qf_size_in, qf_size_out;
   CeedVector  point_coords_elem;
 } CeedOperator_Ref;
 

From 0704d75a8ef593ef012321cffbdeae3669b62595 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 24 May 2024 16:28:01 -0600
Subject: [PATCH 047/571] test: Update location of SmartSim testing

This installation is also a more recent version of SmartSim and
SmartRedis as well.
---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b5ebd5c954..497bfd3139 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -96,7 +96,7 @@ noether-cpu:
 # Libraries for examples
 # -- PETSc with HIP (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/jawr8143/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/jawr8143/SmartSimTestingSoftware/smartredis/install
+    - source /home/phypid/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids solids"
 # -- MFEM v4.6
@@ -196,7 +196,7 @@ noether-cuda:
 # Libraries for examples
 # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/jawr8143/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/jawr8143/SmartSimTestingSoftware/smartredis/install
+    - source /home/phypid/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="cuda" junit BACKENDS="$BACKENDS_GPU" search="petsc fluids solids"
 # Clang-tidy

From 7afa9fc3dd79e297c441a7540bd007b4bd91373e Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 29 May 2024 11:10:05 -0600
Subject: [PATCH 048/571] fluids: Change deprecated vortexshedding.yaml options

---
 examples/fluids/vortexshedding.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/fluids/vortexshedding.yaml b/examples/fluids/vortexshedding.yaml
index 9541a94c7e..9b910da328 100644
--- a/examples/fluids/vortexshedding.yaml
+++ b/examples/fluids/vortexshedding.yaml
@@ -33,7 +33,7 @@ outflow:
 freestream:
   velocity: 1,0,0
 # Small gravity vector to break symmetry so shedding can start
-g: 0,-.01,0
+gravity: 0,-.01,0
 
 # viscosity corresponds to Reynolds number 100
 mu: 0.01
@@ -44,11 +44,11 @@ degree: 3
 dm_plex_filename: examples/fluids/meshes/cylinder-q1-n08.msh
 
 # Boundary Settings
-bc_slip_z: 6
+bc_symmetry_z: 6
 bc_wall: 5
 bc_freestream: 1
 bc_outflow: 2
-bc_slip_y: 3,4
+bc_symmetry_y: 3,4
 wall_comps: 1,2,3
 
 # Primitive variables are preferred at low Mach number

From 75765c5ee087b820f9db177ebacebc85fee03b57 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 29 May 2024 15:40:28 -0600
Subject: [PATCH 049/571] minor - fix typo

---
 backends/cuda-shared/kernels/cuda-shared-basis.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis.cu
index 3374cd8bb8..1eb03fb2e9 100644
--- a/backends/cuda-shared/kernels/cuda-shared-basis.cu
+++ b/backends/cuda-shared/kernels/cuda-shared-basis.cu
@@ -13,7 +13,7 @@ __constant__ CeedScalar c_B[sizeMax*sizeMax];
 __constant__ CeedScalar c_G[sizeMax*sizeMax];
 
 //------------------------------------------------------------------------------
-// Interp device initalization
+// Interp device initialization
 //------------------------------------------------------------------------------
 extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
                                   CeedScalar **c_B_ptr) {
@@ -25,7 +25,7 @@ extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
 }
 
 //------------------------------------------------------------------------------
-// Grad device initalization
+// Grad device initialization
 //------------------------------------------------------------------------------
 extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G,
     CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
@@ -39,7 +39,7 @@ extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G,
 }
 
 //------------------------------------------------------------------------------
-// Collocated grad device initalization
+// Collocated grad device initialization
 //------------------------------------------------------------------------------
 extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G,
     CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {

From fc0f7cc68128f3536a834f19d72828f4c59a4439 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 31 May 2024 11:23:11 -0600
Subject: [PATCH 050/571] basis - ApplyAtPoints should take number of elem

---
 backends/ref/ceed-ref-operator.c              | 10 ++--
 include/ceed-impl.h                           |  2 +-
 include/ceed/ceed.h                           | 34 ++++++------
 interface/ceed-basis.c                        | 55 ++++++++++---------
 .../src/generated/libceed_bindings.jl         |  4 +-
 tests/t350-basis.c                            |  2 +-
 tests/t351-basis.c                            |  2 +-
 tests/t352-basis.c                            |  2 +-
 tests/t353-basis.c                            |  7 ++-
 tests/t354-basis.c                            |  7 ++-
 tests/t355-basis.c                            |  2 +-
 tests/t356-basis.c                            |  2 +-
 tests/t357-basis.c                            |  4 +-
 13 files changed, 70 insertions(+), 63 deletions(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index a0a6a29a2d..538accec17 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -654,7 +654,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
         q_size = (CeedSize)max_num_points;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+            CeedBasisApplyAtPoints(basis, 1, &max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
         break;
     }
     // Initialize full arrays for E-vectors and Q-vectors
@@ -767,7 +767,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
           CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp]));
         }
         CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+            CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i], impl->q_vecs_in[i]));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -803,7 +803,7 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+            CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i]));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -1237,7 +1237,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           case CEED_EVAL_DIV:
           case CEED_EVAL_CURL:
             CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs_in[i],
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs_in[i],
                                                    impl->q_vecs_in[i]));
             break;
           case CEED_EVAL_WEIGHT:
@@ -1280,7 +1280,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           case CEED_EVAL_DIV:
           case CEED_EVAL_CURL:
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
                                                    impl->e_vecs_out[i]));
             break;
           // LCOV_EXCL_START
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 4fe0c67c9f..956b30b757 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -184,7 +184,7 @@ struct CeedElemRestriction_private {
 struct CeedBasis_private {
   Ceed ceed;
   int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector);
-  int (*ApplyAtPoints)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
+  int (*ApplyAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
   int (*Destroy)(CeedBasis);
   int                ref_count;
   bool               is_tensor_basis; /* flag for tensor basis */
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 9ebb40534d..f0f6f65be7 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -289,23 +289,23 @@ CEED_EXTERN int  CeedElemRestrictionDestroy(CeedElemRestriction *rstr);
 //  \int_\Omega v^T f_0(u, \nabla u, qdata) + (\nabla v)^T f_1(u, \nabla u, qdata)
 // where gradients are with respect to the reference element.
 
-CEED_EXTERN int  CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode,
-                                                 CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d,
-                                         const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
-                                   const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
-                                     const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
-                                      const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project);
-CEED_EXTERN int  CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy);
-CEED_EXTERN int  CeedBasisView(CeedBasis basis, FILE *stream);
-CEED_EXTERN int  CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
-CEED_EXTERN int  CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref,
-                                        CeedVector u, CeedVector v);
-CEED_EXTERN int  CeedBasisGetCeed(CeedBasis basis, Ceed *ceed);
+CEED_EXTERN int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode,
+                                                CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d,
+                                        const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
+                                  const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
+                                    const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
+                                     const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project);
+CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy);
+CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream);
+CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                       CeedVector x_ref, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed);
 CEED_EXTERN Ceed CeedBasisReturnCeed(CeedBasis basis);
 CEED_EXTERN int  CeedBasisGetDimension(CeedBasis basis, CeedInt *dim);
 CEED_EXTERN int  CeedBasisGetTopology(CeedBasis basis, CeedElemTopology *topo);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index ed24d72c66..f64ac7bf50 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1499,7 +1499,9 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
   @brief Apply basis evaluation from nodes to arbitrary points
 
   @param[in]  basis      `CeedBasis` to evaluate
-  @param[in]  num_points The number of points to apply the basis evaluation to
+  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points The number of points to apply the basis evaluation to in each element
   @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
                            @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
   @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
@@ -1513,10 +1515,10 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
 
   @ref User
 **/
-int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u,
-                           CeedVector v) {
+int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                           CeedVector x_ref, CeedVector u, CeedVector v) {
   bool     is_tensor_basis;
-  CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1;
+  CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0;
   CeedSize x_length = 0, u_length = 0, v_length;
   Ceed     ceed;
 
@@ -1532,12 +1534,13 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
   if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length));
 
   // Check compatibility of topological and geometrical dimensions
+  for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
   CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0) ||
                 (eval_mode == CEED_EVAL_WEIGHT),
             ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions and number of points");
 
   // Check compatibility coordinates vector
-  CeedCheck((x_length >= num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
+  CeedCheck((x_length >= total_num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
             "Length of reference coordinate vector incompatible with basis dimension and number of points");
 
   // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
@@ -1548,15 +1551,16 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
   bool has_good_dims = true;
   switch (eval_mode) {
     case CEED_EVAL_INTERP:
-      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= num_points * num_q_comp || v_length >= num_nodes * num_comp)) ||
-                       (t_mode == CEED_NOTRANSPOSE && (v_length >= num_points * num_q_comp || u_length >= num_nodes * num_comp)));
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp || v_length >= num_elem * num_nodes * num_comp)) ||
+                       (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp || u_length >= num_elem * num_nodes * num_comp)));
       break;
     case CEED_EVAL_GRAD:
-      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= num_points * num_q_comp * dim || v_length >= num_nodes * num_comp)) ||
-                       (t_mode == CEED_NOTRANSPOSE && (v_length >= num_points * num_q_comp * dim || u_length >= num_nodes * num_comp)));
+      has_good_dims =
+          ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp * dim || v_length >= num_elem * num_nodes * num_comp)) ||
+           (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp * dim || u_length >= num_elem * num_nodes * num_comp)));
       break;
     case CEED_EVAL_WEIGHT:
-      has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= num_points);
+      has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points);
       break;
       // LCOV_EXCL_START
     case CEED_EVAL_NONE:
@@ -1569,13 +1573,14 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
 
   // Backend method
   if (basis->ApplyAtPoints) {
-    CeedCall(basis->ApplyAtPoints(basis, num_points, t_mode, eval_mode, x_ref, u, v));
+    CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
     return CEED_ERROR_SUCCESS;
   }
 
   // Default implementation
   CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
   CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases");
+  CeedCheck(num_elem == 1, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary  points only supported for a single element at a time");
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCall(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
@@ -1652,18 +1657,18 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
           CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
 
           // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
+          for (CeedInt p = 0; p < total_num_points; p++) {
             CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
 
             for (CeedInt d = 0; d < dim; d++) {
               // ------ Tensor contract with current Chebyshev polynomial values
-              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
+              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
               CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
                                                d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
               pre /= Q_1d;
               post *= 1;
             }
-            for (CeedInt c = 0; c < num_comp; c++) v_array[c * num_points + p] = tmp[dim % 2][c];
+            for (CeedInt c = 0; c < num_comp; c++) v_array[c * total_num_points + p] = tmp[dim % 2][c];
           }
           break;
         }
@@ -1671,21 +1676,21 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
           CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
 
           // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
+          for (CeedInt p = 0; p < total_num_points; p++) {
             // Dim**2 contractions, apply grad when pass == dim
             for (CeedInt pass = 0; pass < dim; pass++) {
               CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
 
               for (CeedInt d = 0; d < dim; d++) {
                 // ------ Tensor contract with current Chebyshev polynomial values
-                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
+                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
                 CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
                                                  d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
                 pre /= Q_1d;
                 post *= 1;
               }
-              for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * num_points + p] = tmp[dim % 2][c];
+              for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * total_num_points + p] = tmp[dim % 2][c];
             }
           }
           break;
@@ -1715,13 +1720,13 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
           CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
 
           // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
+          for (CeedInt p = 0; p < total_num_points; p++) {
             CeedInt pre = num_comp * 1, post = 1;
 
-            for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * num_points + p];
+            for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * total_num_points + p];
             for (CeedInt d = 0; d < dim; d++) {
               // ------ Tensor contract with current Chebyshev polynomial values
-              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
+              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
               CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, p > 0 && d == (dim - 1), tmp[d % 2],
                                                d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
               pre /= 1;
@@ -1734,16 +1739,16 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
           CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
 
           // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
+          for (CeedInt p = 0; p < total_num_points; p++) {
             // Dim**2 contractions, apply grad when pass == dim
             for (CeedInt pass = 0; pass < dim; pass++) {
               CeedInt pre = num_comp * 1, post = 1;
 
-              for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * num_points + p];
+              for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * total_num_points + p];
               for (CeedInt d = 0; d < dim; d++) {
                 // ------ Tensor contract with current Chebyshev polynomial values
-                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
+                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
                 CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode,
                                                  (p > 0 || (p == 0 && pass > 0)) && d == (dim - 1), tmp[d % 2],
                                                  d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
diff --git a/julia/LibCEED.jl/src/generated/libceed_bindings.jl b/julia/LibCEED.jl/src/generated/libceed_bindings.jl
index f814609b86..9cbf889dd9 100644
--- a/julia/LibCEED.jl/src/generated/libceed_bindings.jl
+++ b/julia/LibCEED.jl/src/generated/libceed_bindings.jl
@@ -436,8 +436,8 @@ function CeedBasisApply(basis, num_elem, t_mode, eval_mode, u, v)
     ccall((:CeedBasisApply, libceed), Cint, (CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector), basis, num_elem, t_mode, eval_mode, u, v)
 end
 
-function CeedBasisApplyAtPoints(basis, num_points, t_mode, eval_mode, x_ref, u, v)
-    ccall((:CeedBasisApplyAtPoints, libceed), Cint, (CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector), basis, num_points, t_mode, eval_mode, x_ref, u, v)
+function CeedBasisApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)
+    ccall((:CeedBasisApplyAtPoints, libceed), Cint, (CeedBasis, CeedInt, Ptr{CeedInt}, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector), basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)
 end
 
 function CeedBasisGetCeed(basis, ceed)
diff --git a/tests/t350-basis.c b/tests/t350-basis.c
index 54979bb9a0..becc0d98ea 100644
--- a/tests/t350-basis.c
+++ b/tests/t350-basis.c
@@ -56,7 +56,7 @@ int main(int argc, char **argv) {
 
     CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
-  CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
   {
     const CeedScalar *x_array, *v_array;
diff --git a/tests/t351-basis.c b/tests/t351-basis.c
index 14b23730e1..84f59cc838 100644
--- a/tests/t351-basis.c
+++ b/tests/t351-basis.c
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
     {
       const CeedScalar *x_array, *v_array;
diff --git a/tests/t352-basis.c b/tests/t352-basis.c
index a4bf13d8b6..c2da0e2dd4 100644
--- a/tests/t352-basis.c
+++ b/tests/t352-basis.c
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
     {
       const CeedScalar *x_array, *v_array;
diff --git a/tests/t353-basis.c b/tests/t353-basis.c
index 22f80ddcdd..83fd16adb0 100644
--- a/tests/t353-basis.c
+++ b/tests/t353-basis.c
@@ -60,17 +60,18 @@ int main(int argc, char **argv) {
 
     CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
-  CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
   for (CeedInt i = 0; i < num_points; i++) {
-    CeedScalar        fx = 0.0;
+    const CeedInt     num_point[1] = {1};
+    CeedScalar        fx           = 0.0;
     const CeedScalar *x_array, *u_array, *v_array, *u_point_array;
 
     CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array);
     CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     CeedVectorSetValue(x_point, x_array[i]);
-    CeedBasisApplyAtPoints(basis_u, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
     CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
     for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j];
     if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i], fx, x_array[i]);
diff --git a/tests/t354-basis.c b/tests/t354-basis.c
index 85f0ac2293..e137f8c44f 100644
--- a/tests/t354-basis.c
+++ b/tests/t354-basis.c
@@ -69,10 +69,11 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
     for (CeedInt i = 0; i < num_points; i++) {
-      CeedScalar        fx = 0.0;
+      const CeedInt     num_point[1] = {1};
+      CeedScalar        fx           = 0.0;
       CeedScalar        coord[dim];
       const CeedScalar *x_array, *u_array, *v_array, *u_point_array;
 
@@ -81,7 +82,7 @@ int main(int argc, char **argv) {
       CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
       for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * num_points + i];
       CeedVectorSetArray(x_point, CEED_MEM_HOST, CEED_COPY_VALUES, coord);
-      CeedBasisApplyAtPoints(basis_u, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+      CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
       CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
       for (CeedInt j = 0; j < p_dim; j++) fx += u_array[j] * u_point_array[j];
       if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) {
diff --git a/tests/t355-basis.c b/tests/t355-basis.c
index 7fd7906dcb..5b93764a7a 100644
--- a/tests/t355-basis.c
+++ b/tests/t355-basis.c
@@ -62,7 +62,7 @@ int main(int argc, char **argv) {
 
     CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
-  CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
 
   {
     const CeedScalar *x_array, *v_array;
diff --git a/tests/t356-basis.c b/tests/t356-basis.c
index 8eb3c57e7c..263cc43b66 100644
--- a/tests/t356-basis.c
+++ b/tests/t356-basis.c
@@ -75,7 +75,7 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
 
     {
       const CeedScalar *x_array, *v_array;
diff --git a/tests/t357-basis.c b/tests/t357-basis.c
index ecfa56476c..0f4e105a66 100644
--- a/tests/t357-basis.c
+++ b/tests/t357-basis.c
@@ -82,8 +82,8 @@ int main(int argc, char **argv) {
     }
 
     // Calculate G u at arbitrary points, G' * 1 at dofs
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points);
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
     {
       const CeedScalar *u_array, *v_array, *u_points_array;
 

From 18cc0c1da5053f763fdf720b83221e071f604430 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 6 Jun 2024 08:01:03 -0600
Subject: [PATCH 051/571] fluids: Add check to blasius for state variable

---
 examples/fluids/problems/blasius.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index cb061b3bc8..bb51102f6f 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -353,6 +353,8 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   } else if (diff_filter_mms) {
     PetscCall(DifferentialFilterMmsICSetup(problem));
   } else {
+    PetscCheck((user->phys->state_var == STATEVAR_CONSERVATIVE) || (user->app_ctx->test_type == TESTTYPE_DIFF_FILTER), user->comm,
+               PETSC_ERR_ARG_INCOMP, "Can only use conservative variables with Blasius and weak inflow");
     problem->apply_inflow.qfunction              = Blasius_Inflow;
     problem->apply_inflow.qfunction_loc          = Blasius_Inflow_loc;
     problem->apply_inflow_jacobian.qfunction     = Blasius_Inflow_Jacobian;

From faed4840b5126336d09076d80e315366da92cc18 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 6 Jun 2024 09:27:14 -0600
Subject: [PATCH 052/571] doc - fix docstring alignment

Co-authored-by: James Wright <james@jameswright.xyz>
---
 interface/ceed-basis.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index f64ac7bf50..6acb3fdabc 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1499,9 +1499,9 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
   @brief Apply basis evaluation from nodes to arbitrary points
 
   @param[in]  basis      `CeedBasis` to evaluate
-  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
                           the backend will specify the ordering in @ref CeedElemRestrictionCreate()
-  @param[in]  num_points The number of points to apply the basis evaluation to in each element
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
   @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
                            @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
   @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,

From b0cc4569dd7799623ce5e907ba0b8d6826b1ac82 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Jun 2024 12:29:16 -0600
Subject: [PATCH 053/571] basis - refactor Chebyshev interp creation to
 separate fn

---
 include/ceed/backend.h |  1 +
 interface/ceed-basis.c | 64 +++++++++++++++++++++++++++++-------------
 2 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index bd9b365814..d78d37babe 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -312,6 +312,7 @@ typedef enum {
 CEED_EXTERN const char *const CeedFESpaces[];
 
 CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *colo_grad_1d);
+CEED_EXTERN int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d);
 CEED_EXTERN int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor);
 CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisSetData(CeedBasis basis, void *data);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 6acb3fdabc..9f52c88f9f 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -323,6 +323,47 @@ int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Return 1D interpolation matrix to Chebyshev polynomial coefficients on quadrature space
+
+  @param[in]  basis               `CeedBasis`
+  @param[out] chebyshev_interp_1d Row-major (`P_1d * Q_1d`) matrix interpolating from basis nodes to Chebyshev polynomial coefficients
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d) {
+  CeedInt           P_1d, Q_1d;
+  CeedScalar       *C, *chebyshev_coeffs_1d_inv;
+  const CeedScalar *interp_1d, *q_ref_1d;
+  Ceed              ceed;
+
+  CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+
+  // Build coefficient matrix
+  // -- Note: Clang-tidy needs this check because it does not understand the is_tensor_basis check above
+  CeedCheck(P_1d > 0 && Q_1d > 0, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
+  CeedCall(CeedCalloc(Q_1d * Q_1d, &C));
+  CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
+  for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d]));
+
+  // Compute C^+, pseudoinverse of coefficient matrix
+  CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv));
+  CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv));
+
+  // Build mapping from nodes to Chebyshev coefficients
+  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
+  CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d));
+
+  // Cleanup
+  CeedCall(CeedFree(&C));
+  CeedCall(CeedFree(&chebyshev_coeffs_1d_inv));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get tensor status for given `CeedBasis`
 
@@ -1586,38 +1627,21 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num
     return CEED_ERROR_SUCCESS;
   }
   if (!basis->basis_chebyshev) {
-    // Build matrix mapping from quadrature point values to Chebyshev coefficients
-    CeedScalar       *C, *chebyshev_coeffs_1d_inv;
-    const CeedScalar *q_ref_1d;
-
-    // Build coefficient matrix
-    // -- Note: Clang-tidy needs this check because it does not understand the is_tensor_basis check above
-    CeedCheck(P_1d > 0 && Q_1d > 0, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
-    CeedCall(CeedCalloc(Q_1d * Q_1d, &C));
-    CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
-    for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d]));
-
-    // Compute C^+, pseudoinverse of coefficient matrix
-    CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv));
-    CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv));
-
     // Build basis mapping from nodes to Chebyshev coefficients
     CeedScalar       *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d;
-    const CeedScalar *interp_1d;
+    const CeedScalar *q_ref_1d;
 
     CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
     CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d));
     CeedCall(CeedCalloc(Q_1d, &chebyshev_q_weight_1d));
-    CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
-    CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d));
+    CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
+    CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
 
     CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev));
     CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d,
                                      &basis->basis_chebyshev));
 
     // Cleanup
-    CeedCall(CeedFree(&C));
-    CeedCall(CeedFree(&chebyshev_coeffs_1d_inv));
     CeedCall(CeedFree(&chebyshev_interp_1d));
     CeedCall(CeedFree(&chebyshev_grad_1d));
     CeedCall(CeedFree(&chebyshev_q_weight_1d));

From bd83cbc5663d4b964befcdf88d03b52a719d2791 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Jun 2024 14:41:59 -0600
Subject: [PATCH 054/571] basis - clean up check for clang-tidy co-authored-by:
 jrwigh <james@jameswright.xyz>

---
 interface/ceed-basis.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 9f52c88f9f..fe5ae3fb73 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -344,8 +344,8 @@ int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_
   CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
 
   // Build coefficient matrix
-  // -- Note: Clang-tidy needs this check because it does not understand the is_tensor_basis check above
-  CeedCheck(P_1d > 0 && Q_1d > 0, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
+  // -- Note: Clang-tidy needs this check
+  CeedCheck((P_1d > 0) && (Q_1d > 0), ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
   CeedCall(CeedCalloc(Q_1d * Q_1d, &C));
   CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
   for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d]));

From c8565611f4f88586c9ab8f49f4be6e8b5d8096a7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 10 Jun 2024 16:20:38 -0600
Subject: [PATCH 055/571] solids - reduce to minimal example set, see Ratel for
 full

---
 examples/solids/README.md                     |   4 +-
 examples/solids/elasticity.c                  |   6 +-
 examples/solids/index.md                      |  40 --
 examples/solids/problems/cl-problems.h        |  24 +-
 .../finite-strain-mooney-rivlin-initial-1.c   |  58 --
 .../problems/finite-strain-mooney-rivlin.c    |  58 ++
 .../finite-strain-neo-hookean-current-1.c     |  58 --
 .../finite-strain-neo-hookean-current-2.c     |  58 --
 .../finite-strain-neo-hookean-initial-1.c     |  58 --
 .../finite-strain-neo-hookean-initial-2.c     |  58 --
 .../problems/finite-strain-neo-hookean.c      |  58 ++
 examples/solids/problems/linear.c             |  16 +-
 examples/solids/problems/problems.c           |   7 +-
 examples/solids/problems/problems.h           |   7 +-
 .../problems/small-strain-neo-hookean.c       |  16 +-
 ...tial-1.h => finite-strain-mooney-rivlin.h} |  33 +-
 .../finite-strain-neo-hookean-current-1.h     | 535 -----------------
 .../finite-strain-neo-hookean-current-2.h     | 482 ---------------
 .../finite-strain-neo-hookean-initial-2.h     | 559 ------------------
 ...nitial-1.h => finite-strain-neo-hookean.h} |  28 +-
 examples/solids/qfunctions/linear.h           |  16 +-
 .../qfunctions/small-strain-neo-hookean.h     |  18 +-
 examples/solids/src/cl-options.c              |   5 +-
 23 files changed, 195 insertions(+), 2007 deletions(-)
 delete mode 100644 examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c
 create mode 100644 examples/solids/problems/finite-strain-mooney-rivlin.c
 delete mode 100644 examples/solids/problems/finite-strain-neo-hookean-current-1.c
 delete mode 100644 examples/solids/problems/finite-strain-neo-hookean-current-2.c
 delete mode 100644 examples/solids/problems/finite-strain-neo-hookean-initial-1.c
 delete mode 100644 examples/solids/problems/finite-strain-neo-hookean-initial-2.c
 create mode 100644 examples/solids/problems/finite-strain-neo-hookean.c
 rename examples/solids/qfunctions/{finite-strain-mooney-rivlin-initial-1.h => finite-strain-mooney-rivlin.h} (95%)
 delete mode 100644 examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h
 delete mode 100644 examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h
 delete mode 100644 examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h
 rename examples/solids/qfunctions/{finite-strain-neo-hookean-initial-1.h => finite-strain-neo-hookean.h} (97%)

diff --git a/examples/solids/README.md b/examples/solids/README.md
index 4f0d14ab86..c86180d6f2 100644
--- a/examples/solids/README.md
+++ b/examples/solids/README.md
@@ -72,7 +72,7 @@ As an alternative example exploiting {code}`-dm_plex_box_faces`, we consider a {
 Sides 1 through 6 are rotated around $x$-axis:
 
 ```
-./elasticity -problem FSInitial-NH1 -E 1 -nu 0.3 -num_steps 40 -snes_linesearch_type cp -dm_plex_box_faces 4,4,4 -bc_clamp 1,2,3,4,5,6 -bc_clamp_1_rotate 0,0,1,0,.3 -bc_clamp_2_rotate 0,0,1,0,.3 -bc_clamp_3_rotate 0,0,1,0,.3 -bc_clamp_4_rotate 0,0,1,0,.3 -bc_clamp_5_rotate 0,0,1,0,.3 -bc_clamp_6_rotate 0,0,1,0,.3
+./elasticity -problem FS-NH -E 1 -nu 0.3 -num_steps 40 -snes_linesearch_type cp -dm_plex_box_faces 4,4,4 -bc_clamp 1,2,3,4,5,6 -bc_clamp_1_rotate 0,0,1,0,.3 -bc_clamp_2_rotate 0,0,1,0,.3 -bc_clamp_3_rotate 0,0,1,0,.3 -bc_clamp_4_rotate 0,0,1,0,.3 -bc_clamp_5_rotate 0,0,1,0,.3 -bc_clamp_6_rotate 0,0,1,0,.3
 ```
 
 :::{note}
@@ -103,7 +103,7 @@ The command line options just shown are the minimum requirements to run the mini
   -
 
 * - `-problem`
-  - Problem to solve (`Linear`, `SS-NH`, `FSInitial-NH1`, etc.)
+  - Problem to solve (`Linear`, `SS-NH`, `FS-MR`, etc.)
   - `Linear`
 
 * - `-forcing`
diff --git a/examples/solids/elasticity.c b/examples/solids/elasticity.c
index 747f04835f..7de51bd0ee 100644
--- a/examples/solids/elasticity.c
+++ b/examples/solids/elasticity.c
@@ -25,9 +25,9 @@
 //
 // Sample meshes can be found at https://github.com/jeremylt/ceedSampleMeshes
 //
-//TESTARGS(name="linear elasticity, MMS")                                 -ceed {ceed_resource} -test -degree 3 -nu 0.3 -E 1 -dm_plex_box_faces 3,3,3
-//TESTARGS(name="Neo-Hookean hyperelasticity, initial configuration 1")   -ceed {ceed_resource} -test -problem FSInitial-NH1 -E 2.8 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.124627916174e-01
-//TESTARGS(name="Mooney-Rivlin hyperelasticity, initial configuration 1") -ceed {ceed_resource} -test -problem FSInitial-MR1 -mu_1 .5 -mu_2 .5 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.339138880207e-01
+//TESTARGS(name="linear elasticity, MMS")        -ceed {ceed_resource} -test -degree 3 -nu 0.3 -E 1 -dm_plex_box_faces 3,3,3
+//TESTARGS(name="Neo-Hookean hyperelasticity")   -ceed {ceed_resource} -test -problem FS-NH -E 2.8 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.124627916174e-01
+//TESTARGS(name="Mooney-Rivlin hyperelasticity") -ceed {ceed_resource} -test -problem FS-MR -mu_1 .5 -mu_2 .5 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.339138880207e-01
 
 /// @file
 /// CEED elasticity example using PETSc with DMPlex
diff --git a/examples/solids/index.md b/examples/solids/index.md
index 910959b244..f4ff1816b9 100644
--- a/examples/solids/index.md
+++ b/examples/solids/index.md
@@ -737,43 +737,3 @@ $$ (tau-neo-hookean-stable)
 
 which is more numerically stable for small strain, and thus preferred for computation. Note that the $\log J$ is computed via `log1p` {eq}`log1p`, as we discussed in the previous tip.
 :::
-
-### Jacobian representation
-
-We have implemented four storage variants for the Jacobian in our finite strain hyperelasticity. In each case, some variables are computed during residual evaluation and used during Jacobian application.
-
-:::{list-table} Four algorithms for Jacobian action in finite strain hyperelasticity problem
-:header-rows: 1
-:widths: auto
-
-* - Option `-problem`
-  - Static storage
-  - Computed storage
-  - \# scalars
-  - Equations
-
-
-* - `FSInitial-NH1`
-  - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_X \bm u$
-  - 19
-  - {eq}`eq-diff-P` {eq}`eq-neo-hookean-incremental-stress`
-
-* - `FSInitial-NH2`
-  - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_X \bm u, \bm C^{-1}, \lambda \log J$
-  - 26
-  - {eq}`eq-diff-P` {eq}`eq-neo-hookean-incremental-stress`
-
-* - `FSCurrent-NH1`
-  - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_X \bm u$
-  - 19
-  - {eq}`jacobian-weak-form-current` {eq}`nabla_xdu`
-
-* - `FSCurrent-NH2`
-  - $\operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_x \hat X, \bm \tau, \lambda \log J$
-  - 17
-  - {eq}`jacobian-weak-form-current` {eq}`jacobian-weak-form-current2`
-:::
diff --git a/examples/solids/problems/cl-problems.h b/examples/solids/problems/cl-problems.h
index 249e1fd604..acd8dcf15d 100644
--- a/examples/solids/problems/cl-problems.h
+++ b/examples/solids/problems/cl-problems.h
@@ -7,22 +7,8 @@
 #pragma once
 
 // Problem options
-typedef enum {
-  ELAS_LINEAR        = 0,
-  ELAS_SS_NH         = 1,
-  ELAS_FSInitial_NH1 = 2,
-  ELAS_FSInitial_NH2 = 3,
-  ELAS_FSCurrent_NH1 = 4,
-  ELAS_FSCurrent_NH2 = 5,
-  ELAS_FSInitial_MR1 = 6
-} problemType;
-static const char *const problemTypes[]        = {"Linear",        "SS-NH",         "FSInitial-NH1", "FSInitial-NH2", "FSCurrent-NH1",
-                                                  "FSCurrent-NH2", "FSInitial-MR1", "problemType",   "ELAS_",         0};
-static const char *const problemTypesForDisp[] = {
-    "Linear elasticity",
-    "Hyperelasticity small strain, Neo-Hookean",
-    "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage",
-    "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u), C_inv, constant storage",
-    "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage",
-    "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxcurr, tau, constant storage",
-    "Hyperelasticity finite strain Initial configuration Moony-Rivlin w/ dXref_dxinit, Grad(u) storage"};
+typedef enum { ELAS_LINEAR = 0, ELAS_SS_NH = 1, ELAS_FS_NH = 2, ELAS_FS_MR = 3 } problemType;
+static const char *const problemTypes[]        = {"Linear", "SS-NH", "FS-NH", "FS-MR", "problemType", "ELAS_", 0};
+static const char *const problemTypesForDisp[] = {"Linear elasticity", "Hyperelasticity small strain, Neo-Hookean",
+                                                  "Hyperelasticity finite strain Initial configuration Neo-Hookean",
+                                                  "Hyperelasticity finite strain Initial configuration Moony-Rivlin"};
diff --git a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c b/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c
deleted file mode 100644
index 57d37efd63..0000000000
--- a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-mooney-rivlin-initial-1.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/mooney-rivlin.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu"};
-static CeedInt           field_sizes[] = {9};
-
-ProblemData finite_strain_Mooney_Rivlin_initial_1 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSInitialMR1F,
-    .residual_loc         = ElasFSInitialMR1F_loc,
-    .number_fields_stored = 1,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSInitialMR1dF,
-    .jacobian_loc         = ElasFSInitialMR1dF_loc,
-    .energy               = ElasFSInitialMR1Energy,
-    .energy_loc           = ElasFSInitialMR1Energy_loc,
-    .diagnostic           = ElasFSInitialMR1Diagnostic,
-    .diagnostic_loc       = ElasFSInitialMR1Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSInitialMR1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_Mooney_Rivlin_initial_1, fine_level,
-                                  num_comp_u, U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSInitialMR1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_Mooney_Rivlin_initial_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-mooney-rivlin.c b/examples/solids/problems/finite-strain-mooney-rivlin.c
new file mode 100644
index 0000000000..6ce2201907
--- /dev/null
+++ b/examples/solids/problems/finite-strain-mooney-rivlin.c
@@ -0,0 +1,58 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "../qfunctions/finite-strain-mooney-rivlin.h"
+
+#include <ceed.h>
+#include <petscsys.h>
+
+#include "../include/setup-libceed.h"
+#include "../include/structs.h"
+#include "../problems/mooney-rivlin.h"
+#include "../problems/problems.h"
+#include "../qfunctions/common.h"
+
+static const char *const field_names[] = {"gradu"};
+static CeedInt           field_sizes[] = {9};
+
+ProblemData finite_strain_Mooney_Rivlin = {
+    .setup_geo            = SetupGeo,
+    .setup_geo_loc        = SetupGeo_loc,
+    .q_data_size          = 10,
+    .quadrature_mode      = CEED_GAUSS,
+    .residual             = ElasFSResidual_MR,
+    .residual_loc         = ElasFSResidual_MR_loc,
+    .number_fields_stored = 1,
+    .field_names          = field_names,
+    .field_sizes          = field_sizes,
+    .jacobian             = ElasFSJacobian_MR,
+    .jacobian_loc         = ElasFSJacobian_MR_loc,
+    .energy               = ElasFSEnergy_MR,
+    .energy_loc           = ElasFSEnergy_MR_loc,
+    .diagnostic           = ElasFSDiagnostic_MR,
+    .diagnostic_loc       = ElasFSDiagnostic_MR_loc,
+};
+
+PetscErrorCode SetupLibceedFineLevel_ElasFSMR(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
+                                              PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed,
+                                              CeedVector neumann_ceed, CeedData *data) {
+  PetscFunctionBegin;
+
+  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_Mooney_Rivlin, fine_level, num_comp_u,
+                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+};
+
+PetscErrorCode SetupLibceedLevel_ElasFSMR(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
+                                          PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
+  PetscFunctionBegin;
+
+  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_Mooney_Rivlin, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-1.c b/examples/solids/problems/finite-strain-neo-hookean-current-1.c
deleted file mode 100644
index e6ad6a8a99..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-current-1.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-current-1.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu"};
-static CeedInt           field_sizes[] = {9};
-
-ProblemData finite_strain_neo_Hookean_current_1 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSCurrentNH1F,
-    .residual_loc         = ElasFSCurrentNH1F_loc,
-    .number_fields_stored = 1,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSCurrentNH1dF,
-    .jacobian_loc         = ElasFSCurrentNH1dF_loc,
-    .energy               = ElasFSCurrentNH1Energy,
-    .energy_loc           = ElasFSCurrentNH1Energy_loc,
-    .diagnostic           = ElasFSCurrentNH1Diagnostic,
-    .diagnostic_loc       = ElasFSCurrentNH1Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_1, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-2.c b/examples/solids/problems/finite-strain-neo-hookean-current-2.c
deleted file mode 100644
index 78f34d3ee7..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-current-2.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-current-2.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"dXdx", "tau", "lambda_log_J"};
-static CeedInt           field_sizes[] = {9, 6, 1};
-
-ProblemData finite_strain_neo_Hookean_current_2 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSCurrentNH2F,
-    .residual_loc         = ElasFSCurrentNH2F_loc,
-    .number_fields_stored = 3,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSCurrentNH2dF,
-    .jacobian_loc         = ElasFSCurrentNH2dF_loc,
-    .energy               = ElasFSCurrentNH2Energy,
-    .energy_loc           = ElasFSCurrentNH2Energy_loc,
-    .diagnostic           = ElasFSCurrentNH2Diagnostic,
-    .diagnostic_loc       = ElasFSCurrentNH2Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_2, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_2, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c b/examples/solids/problems/finite-strain-neo-hookean-initial-1.c
deleted file mode 100644
index cb45b602ad..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-initial-1.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu"};
-static CeedInt           field_sizes[] = {9};
-
-ProblemData finite_strain_neo_Hookean_initial_1 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSInitialNH1F,
-    .residual_loc         = ElasFSInitialNH1F_loc,
-    .number_fields_stored = 1,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSInitialNH1dF,
-    .jacobian_loc         = ElasFSInitialNH1dF_loc,
-    .energy               = ElasFSInitialNH1Energy,
-    .energy_loc           = ElasFSInitialNH1Energy_loc,
-    .diagnostic           = ElasFSInitialNH1Diagnostic,
-    .diagnostic_loc       = ElasFSInitialNH1Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_1, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSInitialNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c b/examples/solids/problems/finite-strain-neo-hookean-initial-2.c
deleted file mode 100644
index 9d52b35aec..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-initial-2.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu", "C_inv", "lambda_log_J"};
-static CeedInt           field_sizes[] = {9, 6, 1};
-
-ProblemData finite_strain_neo_Hookean_initial_2 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSInitialNH2F,
-    .residual_loc         = ElasFSInitialNH2F_loc,
-    .number_fields_stored = 3,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSInitialNH2dF,
-    .jacobian_loc         = ElasFSInitialNH2dF_loc,
-    .energy               = ElasFSInitialNH2Energy,
-    .energy_loc           = ElasFSInitialNH2Energy_loc,
-    .diagnostic           = ElasFSInitialNH2Diagnostic,
-    .diagnostic_loc       = ElasFSInitialNH2Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_2, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSInitialNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_2, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-neo-hookean.c b/examples/solids/problems/finite-strain-neo-hookean.c
new file mode 100644
index 0000000000..fac1e47ba6
--- /dev/null
+++ b/examples/solids/problems/finite-strain-neo-hookean.c
@@ -0,0 +1,58 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "../qfunctions/finite-strain-neo-hookean.h"
+
+#include <ceed.h>
+#include <petscsys.h>
+
+#include "../include/setup-libceed.h"
+#include "../include/structs.h"
+#include "../problems/neo-hookean.h"
+#include "../problems/problems.h"
+#include "../qfunctions/common.h"
+
+static const char *const field_names[] = {"gradu"};
+static CeedInt           field_sizes[] = {9};
+
+ProblemData finite_strain_neo_Hookean = {
+    .setup_geo            = SetupGeo,
+    .setup_geo_loc        = SetupGeo_loc,
+    .q_data_size          = 10,
+    .quadrature_mode      = CEED_GAUSS,
+    .residual             = ElasFSResidual_NH,
+    .residual_loc         = ElasFSResidual_NH_loc,
+    .number_fields_stored = 1,
+    .field_names          = field_names,
+    .field_sizes          = field_sizes,
+    .jacobian             = ElasFSJacobian_NH,
+    .jacobian_loc         = ElasFSJacobian_NH_loc,
+    .energy               = ElasFSEnergy_NH,
+    .energy_loc           = ElasFSEnergy_NH_loc,
+    .diagnostic           = ElasFSDiagnostic_NH,
+    .diagnostic_loc       = ElasFSDiagnostic_NH_loc,
+};
+
+PetscErrorCode SetupLibceedFineLevel_ElasFSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
+                                              PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed,
+                                              CeedVector neumann_ceed, CeedData *data) {
+  PetscFunctionBegin;
+
+  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean, fine_level, num_comp_u, U_g_size,
+                                  U_loc_size, force_ceed, neumann_ceed, data));
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+};
+
+PetscErrorCode SetupLibceedLevel_ElasFSNH(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
+                                          PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
+  PetscFunctionBegin;
+
+  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+};
diff --git a/examples/solids/problems/linear.c b/examples/solids/problems/linear.c
index c013ee716a..051b2f1155 100644
--- a/examples/solids/problems/linear.c
+++ b/examples/solids/problems/linear.c
@@ -22,15 +22,15 @@ ProblemData linear_elasticity = {
     .setup_geo_loc        = SetupGeo_loc,
     .q_data_size          = 10,
     .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasLinearF,
-    .residual_loc         = ElasLinearF_loc,
+    .residual             = ElasResidual_Linear,
+    .residual_loc         = ElasResidual_Linear_loc,
     .number_fields_stored = 0,
-    .jacobian             = ElasLineardF,
-    .jacobian_loc         = ElasLineardF_loc,
-    .energy               = ElasLinearEnergy,
-    .energy_loc           = ElasLinearEnergy_loc,
-    .diagnostic           = ElasLinearDiagnostic,
-    .diagnostic_loc       = ElasLinearDiagnostic_loc,
+    .jacobian             = ElasJacobian_Linear,
+    .jacobian_loc         = ElasJacobian_Linear_loc,
+    .energy               = ElasEnergy_Linear,
+    .energy_loc           = ElasEnergy_Linear_loc,
+    .diagnostic           = ElasDiagnostic_Linear,
+    .diagnostic_loc       = ElasDiagnostic_Linear_loc,
     .true_soln            = MMSTrueSoln,
     .true_soln_loc        = MMSTrueSoln_loc,
 };
diff --git a/examples/solids/problems/problems.c b/examples/solids/problems/problems.c
index e125997093..5cfd9769a9 100644
--- a/examples/solids/problems/problems.c
+++ b/examples/solids/problems/problems.c
@@ -15,11 +15,8 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions) {
 
   SOLIDS_PROBLEM_REGISTER(problem_functions, "Linear", ElasLinear, NH);
   SOLIDS_PROBLEM_REGISTER(problem_functions, "SS-NH", ElasSSNH, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH1", ElasFSCurrentNH1, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH2", ElasFSCurrentNH2, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH1", ElasFSInitialNH1, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH2", ElasFSInitialNH2, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-MR1", ElasFSInitialMR1, MR);
+  SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-NH", ElasFSNH, NH);
+  SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-MR", ElasFSMR, MR);
 
   PetscFunctionReturn(PETSC_SUCCESS);
 };
diff --git a/examples/solids/problems/problems.h b/examples/solids/problems/problems.h
index 17503fda72..6314426ac5 100644
--- a/examples/solids/problems/problems.h
+++ b/examples/solids/problems/problems.h
@@ -36,8 +36,5 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions);
 
 SOLIDS_PROBLEM(ElasLinear);
 SOLIDS_PROBLEM(ElasSSNH);
-SOLIDS_PROBLEM(ElasFSCurrentNH1);
-SOLIDS_PROBLEM(ElasFSCurrentNH2);
-SOLIDS_PROBLEM(ElasFSInitialNH1);
-SOLIDS_PROBLEM(ElasFSInitialNH2);
-SOLIDS_PROBLEM(ElasFSInitialMR1);
+SOLIDS_PROBLEM(ElasFSNH);
+SOLIDS_PROBLEM(ElasFSMR);
diff --git a/examples/solids/problems/small-strain-neo-hookean.c b/examples/solids/problems/small-strain-neo-hookean.c
index be2fb27c43..f6252807cf 100644
--- a/examples/solids/problems/small-strain-neo-hookean.c
+++ b/examples/solids/problems/small-strain-neo-hookean.c
@@ -24,17 +24,17 @@ ProblemData small_strain_neo_Hookean = {
     .setup_geo_loc        = SetupGeo_loc,
     .q_data_size          = 10,
     .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasSSNHF,
-    .residual_loc         = ElasSSNHF_loc,
+    .residual             = ElasSSResidual_NH,
+    .residual_loc         = ElasSSResidual_NH_loc,
     .number_fields_stored = 1,
     .field_names          = field_names,
     .field_sizes          = field_sizes,
-    .jacobian             = ElasSSNHdF,
-    .jacobian_loc         = ElasSSNHdF_loc,
-    .energy               = ElasSSNHEnergy,
-    .energy_loc           = ElasSSNHEnergy_loc,
-    .diagnostic           = ElasSSNHDiagnostic,
-    .diagnostic_loc       = ElasSSNHDiagnostic_loc,
+    .jacobian             = ElasSSJacobian_NH,
+    .jacobian_loc         = ElasSSJacobian_NH_loc,
+    .energy               = ElasSSEnergy_NH,
+    .energy_loc           = ElasSSEnergy_NH_loc,
+    .diagnostic           = ElasSSDiagnostic_NH,
+    .diagnostic_loc       = ElasSSDiagnostic_NH_loc,
 };
 
 PetscErrorCode SetupLibceedFineLevel_ElasSSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
diff --git a/examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
similarity index 95%
rename from examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h
rename to examples/solids/qfunctions/finite-strain-mooney-rivlin.h
index 444b71d27f..f9c19e81b1 100644
--- a/examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h
+++ b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
@@ -91,14 +91,14 @@ CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedS
   };
   for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA);
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 #endif
 // -----------------------------------------------------------------------------
 // Common computations between FS and dFS
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION_HELPER int commonFSMR1(const CeedScalar mu_1, const CeedScalar mu_2, const CeedScalar lambda, const CeedScalar grad_u[3][3],
-                                      CeedScalar Swork[6], CeedScalar Cwork[6], CeedScalar Cinvwork[6], CeedScalar *logJ) {
+CEED_QFUNCTION_HELPER int commonFSMR(const CeedScalar mu_1, const CeedScalar mu_2, const CeedScalar lambda, const CeedScalar grad_u[3][3],
+                                     CeedScalar Swork[6], CeedScalar Cwork[6], CeedScalar Cinvwork[6], CeedScalar *logJ) {
   // E - Green-Lagrange strain tensor
   //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
   const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
@@ -147,13 +147,13 @@ CEED_QFUNCTION_HELPER int commonFSMR1(const CeedScalar mu_1, const CeedScalar mu
                - mu_2 * Cwork[i];
   }
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 
 // -----------------------------------------------------------------------------
 // Residual evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSResidual_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -218,7 +218,7 @@ CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const
 
     // Common components of finite strain calculations
     CeedScalar Swork[6], Cwork[6], Cinvwork[6], logJ;
-    commonFSMR1(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
+    commonFSMR(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
 
     // Second Piola-Kirchhoff (S)
     const CeedScalar S[3][3] = {
@@ -245,13 +245,13 @@ CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Jacobian evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSJacobian_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
         (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
@@ -311,7 +311,7 @@ CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const
 
     // Common components of finite strain calculations
     CeedScalar Swork[6], Cwork[6], Cinvwork[6], logJ;
-    commonFSMR1(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
+    commonFSMR(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
 
     // dE - Green-Lagrange strain tensor
     const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
@@ -408,12 +408,13 @@ CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
+
 // -----------------------------------------------------------------------------
 // Strain energy computation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSEnergy_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -498,13 +499,13 @@ CEED_QFUNCTION(ElasFSInitialMR1Energy)(void *ctx, CeedInt Q, const CeedScalar *c
 
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Nodal diagnostic quantities for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSDiagnostic_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
         (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
@@ -605,6 +606,6 @@ CEED_QFUNCTION(ElasFSInitialMR1Diagnostic)(void *ctx, CeedInt Q, const CeedScala
     diagnostic[7][i] = (0.5 * lambda * logJ * logJ - (mu_1 + 2 * mu_2) * logJ + (mu_1 / 2.) * (I_1 - 3) + (mu_2 / 2.) * (I_2 - 3));
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h b/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h
deleted file mode 100644
index 129a0af3b4..0000000000
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h
+++ /dev/null
@@ -1,535 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, finite strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-//  The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean
-//  model.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES_SHIFTED
-#define LOG1P_SERIES_SHIFTED
-CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
-  const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) - 1;
-  CeedScalar       sum = 0;
-  if (1) {           // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient
-    if (x < left) {  // Replace if with while for arbitrary range (may hurt vectorization)
-      sum -= log(2.) / 2;
-      x = 1 + 2 * x;
-    } else if (right < x) {
-      sum += log(2.) / 2;
-      x = (x - 1) / 2;
-    }
-  }
-  CeedScalar       y  = x / (2. + x);
-  const CeedScalar y2 = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute det F - 1
-// -----------------------------------------------------------------------------
-#ifndef DETJM1
-#define DETJM1
-CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
-  return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) +
-         grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) +
-         grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
-         grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
-         grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute matrix^(-1), where matrix is nonsymetric, returns array of 9
-// -----------------------------------------------------------------------------
-#ifndef MatinvNonSym
-#define MatinvNonSym
-CEED_QFUNCTION_HELPER int computeMatinvNonSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[9]) {
-  // Compute A^(-1) : A-Inverse
-  CeedScalar B[9] = {
-      A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */
-      A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */
-      A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */
-      A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */
-      A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */
-      A[0][2] * A[2][1] - A[0][1] * A[2][2], /* *NOPAD* */
-      A[0][1] * A[2][0] - A[0][0] * A[2][1], /* *NOPAD* */
-      A[1][0] * A[2][1] - A[1][1] * A[2][0], /* *NOPAD* */
-      A[1][2] * A[2][0] - A[1][0] * A[2][2]  /* *NOPAD* */
-  };
-  for (CeedInt m = 0; m < 9; m++) Ainv[m] = B[m] / (detA);
-
-  return 0;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Common computations between Ftau and dFtau
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION_HELPER int commonFtau(const CeedScalar lambda, const CeedScalar mu, const CeedScalar Grad_u[3][3], CeedScalar Finv[3][3],
-                                     CeedScalar tau_work[6], CeedScalar *llnj) {
-  // Compute The Deformation Gradient : F = I3 + Grad_u
-  const CeedScalar F[3][3] = {
-      {Grad_u[0][0] + 1, Grad_u[0][1],     Grad_u[0][2]    },
-      {Grad_u[1][0],     Grad_u[1][1] + 1, Grad_u[1][2]    },
-      {Grad_u[2][0],     Grad_u[2][1],     Grad_u[2][2] + 1}
-  };
-
-  // b - I3 = (Grad_u + Grad_u^T + Grad_u*Grad_u^T)
-  const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-  CeedScalar    bMI3[6];
-  for (CeedInt m = 0; m < 6; m++) {
-    bMI3[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-    for (CeedInt n = 0; n < 3; n++) bMI3[m] += Grad_u[indj[m]][n] * Grad_u[indk[m]][n];
-  }
-  const CeedScalar Jm1  = computeJM1(Grad_u);
-  const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-  // Computer F^(-1)
-  const CeedScalar detF = Jm1 + 1.;
-  CeedScalar       Finvwork[9];
-  computeMatinvNonSym(F, detF, Finvwork);
-
-  Finv[0][0] = Finvwork[0];
-  Finv[0][1] = Finvwork[5];
-  Finv[0][2] = Finvwork[4];
-  Finv[1][0] = Finvwork[8];
-  Finv[1][1] = Finvwork[1];
-  Finv[1][2] = Finvwork[3];
-  Finv[2][0] = Finvwork[7];
-  Finv[2][1] = Finvwork[6];
-  Finv[2][2] = Finvwork[2];
-
-  // Compute the Kirchhoff stress (tau) tau = mu*(b - I3) + lambda*log(J)*I3
-  *llnj = lambda * logJ;
-
-  tau_work[0] = mu * bMI3[0] + *llnj;
-  tau_work[1] = mu * bMI3[1] + *llnj;
-  tau_work[2] = mu * bMI3[2] + *llnj;
-  tau_work[3] = mu * bMI3[3];
-  tau_work[4] = mu * bMI3[4];
-  tau_work[5] = mu * bMI3[5];
-
-  return 0;
-};
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store grad_u for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*Grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Formulation Terminology:
-  //  I3    : 3x3 Identity matrix
-  //  b     : left Cauchy-Green tensor
-  //  binv  : inverse of b
-  //  F     : deformation gradient
-  //  tau   : Kirchhoff stress (in current config)
-  // Formulation:
-  //  F =  I3 + Grad_u
-  //  J = det(F)
-  //  b = F*F(^T)
-  //  tau = mu*(b - I3) + lambda*log(J)*I3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-    // dXdx_initial = dX/dx_initial
-    // X is natural coordinate sys OR Reference [-1,1]^dim
-    // x_initial is initial config coordinate system
-    const CeedScalar dXdx_initial[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) Grad_u[j][k][i] += du[j][m] * dXdx_initial[m][k];
-      }
-    }
-
-    const CeedScalar tempGradu[3][3] = {
-        {Grad_u[0][0][i], Grad_u[0][1][i], Grad_u[0][2][i]},
-        {Grad_u[1][0][i], Grad_u[1][1][i], Grad_u[1][2][i]},
-        {Grad_u[2][0][i], Grad_u[2][1][i], Grad_u[2][2][i]}
-    };
-
-    // Common components of finite strain calculations
-    CeedScalar Finv[3][3], tau_work[6], llnj;
-
-    commonFtau(lambda, mu, tempGradu, Finv, tau_work, &llnj);
-    const CeedScalar tau[3][3] = {
-        {tau_work[0], tau_work[5], tau_work[4]},
-        {tau_work[5], tau_work[1], tau_work[3]},
-        {tau_work[4], tau_work[3], tau_work[2]}
-    };
-    // x is current config coordinate system
-    // dXdx = dX/dx = dX/dx_initial * F^{-1}
-    // Note that F^{-1} = dx_initial/dx
-    CeedScalar dXdx[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dXdx[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) dXdx[j][k] += dXdx_initial[j][m] * Finv[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight to intermediate stress
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * tau[j][m] * wdetJ;
-      }
-    }
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // F is used for hyperelasticity (non-linear)
-  const CeedScalar(*Grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of delta_u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-    // dXdx_initial = dX/dx_initial
-    // X is natural coordinate sys OR Reference [-1,1]^dim
-    // x_initial is initial config coordinate system
-    const CeedScalar dXdx_initial[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_du
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to deltadu = graddelta
-    // This is dF = Grad_du
-    CeedScalar Grad_du[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        Grad_du[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) Grad_du[j][k] += dXdx_initial[m][k] * deltadu[j][m];
-      }
-    }
-
-    const CeedScalar tempGradu[3][3] = {
-        {Grad_u[0][0][i], Grad_u[0][1][i], Grad_u[0][2][i]},
-        {Grad_u[1][0][i], Grad_u[1][1][i], Grad_u[1][2][i]},
-        {Grad_u[2][0][i], Grad_u[2][1][i], Grad_u[2][2][i]}
-    };
-
-    // Common components of finite strain calculations
-    CeedScalar F_inv[3][3], tau_work[6], llnj;
-
-    // Common components of finite strain calculations (cur. config.)
-    commonFtau(lambda, mu, tempGradu, F_inv, tau_work, &llnj);
-    const CeedScalar tau[3][3] = {
-        {tau_work[0], tau_work[5], tau_work[4]},
-        {tau_work[5], tau_work[1], tau_work[3]},
-        {tau_work[4], tau_work[3], tau_work[2]}
-    };
-
-    // Compute grad_du = \nabla_x (deltau) = deltau * dX/dx
-    CeedScalar grad_du[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        grad_du[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du[j][k] += deltadu[j][m] * F_inv[m][k];
-      }
-    }
-
-    // Compute grad_du_tau = grad_du*tau
-    CeedScalar grad_du_tau[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        grad_du_tau[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du_tau[j][k] += grad_du[j][m] * tau[m][k];
-      }
-    }
-
-    // Compute depsilon = (grad_du + grad_du^T)/2
-    const CeedScalar depsilon[3][3] = {
-        {(grad_du[0][0] + grad_du[0][0]) / 2., (grad_du[0][1] + grad_du[1][0]) / 2., (grad_du[0][2] + grad_du[2][0]) / 2.},
-        {(grad_du[1][0] + grad_du[0][1]) / 2., (grad_du[1][1] + grad_du[1][1]) / 2., (grad_du[1][2] + grad_du[2][1]) / 2.},
-        {(grad_du[2][0] + grad_du[0][2]) / 2., (grad_du[2][1] + grad_du[1][2]) / 2., (grad_du[2][2] + grad_du[2][2]) / 2.}
-    };
-    // Compute trace(depsilon)
-    CeedScalar tr_deps = depsilon[0][0] + depsilon[1][1] + depsilon[2][2];
-    // Compute grad_du*tau + trace(depsilon)I3
-    grad_du_tau[0][0] += lambda * tr_deps;
-    grad_du_tau[1][1] += lambda * tr_deps;
-    grad_du_tau[2][2] += lambda * tr_deps;
-    // Compute dp = grad_du*tau + trace(depsilon)I3 +2(mu-lambda*logJ)depsilon
-    CeedScalar dp[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        dp[j][k] = grad_du_tau[j][k] + 2 * (mu - llnj) * depsilon[j][k];
-      }
-    }
-
-    // x is current config coordinate system
-    // dXdx = dX/dx = dX/dx_initial * F^{-1}
-    // Note that F^{-1} = dx_initial/dx
-    CeedScalar dXdx[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dXdx[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) dXdx[j][k] += dXdx_initial[j][m] * F_inv[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * dp[j][m] * wdetJ;
-      }
-    }
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Strain energy Phi(E) for compressible Neo-Hookean
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    energy[i]             = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = Grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    diagnostic[3][i]      = -lambda * logJ;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.;
-    }
-    diagnostic[6][i] = Jm1 + 1.;
-
-    // Strain energy
-    diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-// -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h b/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h
deleted file mode 100644
index b03334f999..0000000000
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, finite strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-//  The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean
-//  model.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES_SHIFTED
-#define LOG1P_SERIES_SHIFTED
-CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
-  const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) - 1;
-  CeedScalar       sum = 0;
-  if (1) {           // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient
-    if (x < left) {  // Replace if with while for arbitrary range (may hurt vectorization)
-      sum -= log(2.) / 2;
-      x = 1 + 2 * x;
-    } else if (right < x) {
-      sum += log(2.) / 2;
-      x = (x - 1) / 2;
-    }
-  }
-  CeedScalar       y  = x / (2. + x);
-  const CeedScalar y2 = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute det F - 1
-// -----------------------------------------------------------------------------
-#ifndef DETJM1
-#define DETJM1
-CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
-  return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) +
-         grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) +
-         grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
-         grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
-         grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute matrix^(-1), where matrix is nonsymetric, returns array of 9
-// -----------------------------------------------------------------------------
-#ifndef MatinvNonSym
-#define MatinvNonSym
-CEED_QFUNCTION_HELPER int computeMatinvNonSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[9]) {
-  // Compute A^(-1) : A-Inverse
-  CeedScalar B[9] = {
-      A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */
-      A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */
-      A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */
-      A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */
-      A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */
-      A[0][2] * A[2][1] - A[0][1] * A[2][2], /* *NOPAD* */
-      A[0][1] * A[2][0] - A[0][0] * A[2][1], /* *NOPAD* */
-      A[1][0] * A[2][1] - A[1][1] * A[2][0], /* *NOPAD* */
-      A[1][2] * A[2][0] - A[1][0] * A[2][2]  /* *NOPAD* */
-  };
-  for (CeedInt m = 0; m < 9; m++) Ainv[m] = B[m] / (detA);
-
-  return 0;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store dXdx
-  CeedScalar(*dXdx)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-  // Store tau
-  CeedScalar(*tau)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[2];
-  // Store constant lam_log_J = lambda*log(J)
-  CeedScalar(*lam_log_J)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[3];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Formulation Terminology:
-  //  I3    : 3x3 Identity matrix
-  //  b     : left Cauchy-Green tensor
-  //  F     : deformation gradient
-  //  tau   : Kirchhoff stress (in current config)
-  // Formulation:
-  //  F =  I3 + Grad_ue
-  //  J = det(F)
-  //  b = F*F^{T}
-  //  tau = mu*b - (mu - lambda*log(J))*I3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-    // dXdx_initial = dX/dx_initial
-    // X is natural coordinate sys OR Reference [-1,1]^dim
-    // x_initial is initial config coordinate system
-    const CeedScalar dXdx_initial[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // X is natural coordinate sys OR Reference system
-    // x_initial is initial config coordinate system
-    // Grad_u =du/dx_initial= du/dX * dX/dx_initial
-    CeedScalar Grad_u[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) Grad_u[j][k] += du[j][m] * dXdx_initial[m][k];
-      }
-    }
-
-    // Compute The Deformation Gradient : F = I3 + Gradu
-    const CeedScalar F[3][3] = {
-        {Grad_u[0][0] + 1, Grad_u[0][1],     Grad_u[0][2]    },
-        {Grad_u[1][0],     Grad_u[1][1] + 1, Grad_u[1][2]    },
-        {Grad_u[2][0],     Grad_u[2][1],     Grad_u[2][2] + 1}
-    };
-
-    // b - I3 = (Grad_u + Grad_u^T + Grad_u*Grad_u^T)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    bMI3[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      bMI3[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) bMI3[m] += Grad_u[indj[m]][n] * Grad_u[indk[m]][n];
-    }
-
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-    // store lam_log_J = lambda*log(J)
-    lam_log_J[0][i] = lambda * logJ;
-
-    // tau = mu*b - Cc1*I3;
-    tau[0][i] = mu * bMI3[0] + lam_log_J[0][i];
-    tau[1][i] = mu * bMI3[1] + lam_log_J[0][i];
-    tau[2][i] = mu * bMI3[2] + lam_log_J[0][i];
-    tau[3][i] = mu * bMI3[3];
-    tau[4][i] = mu * bMI3[4];
-    tau[5][i] = mu * bMI3[5];
-
-    // Computer F^{-1}
-    const CeedScalar detF = Jm1 + 1.;
-    CeedScalar       Finvwork[9];
-    computeMatinvNonSym(F, detF, Finvwork);
-    CeedScalar Finv[3][3];
-    Finv[0][0] = Finvwork[0];
-    Finv[0][1] = Finvwork[5];
-    Finv[0][2] = Finvwork[4];
-    Finv[1][0] = Finvwork[8];
-    Finv[1][1] = Finvwork[1];
-    Finv[1][2] = Finvwork[3];
-    Finv[2][0] = Finvwork[7];
-    Finv[2][1] = Finvwork[6];
-    Finv[2][2] = Finvwork[2];
-
-    // x is current config coordinate system
-    // dXdx = dX/dx = dX/dx_initial * F^{-1}
-    // Note that F^{-1} = dx_initial/dx
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dXdx[j][k][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dXdx[j][k][i] += dXdx_initial[j][m] * Finv[m][k];
-      }
-    }
-
-    const CeedScalar temptau[3][3] = {
-        {tau[0][i], tau[5][i], tau[4][i]},
-        {tau[5][i], tau[1][i], tau[3][i]},
-        {tau[4][i], tau[3][i], tau[2][i]}
-    };
-    // Apply dXdx^T and weight to intermediate stress
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m][i] * temptau[j][m] * wdetJ;
-      }
-    }
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // dXdx computed in residual
-  const CeedScalar(*dXdx)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  // tau computed in residual
-  const CeedScalar(*tau)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  // lam_log_J = lambda*log(J) computed in residual
-  const CeedScalar(*lam_log_J)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4];
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of delta_u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-
-    // Compute grad_du = \nabla_x (deltau) = deltau * dX/dx
-    CeedScalar grad_du[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_du[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du[j][k] += deltadu[j][m] * dXdx[m][k][i];
-      }
-    }
-
-    const CeedScalar temptau[3][3] = {
-        {tau[0][i], tau[5][i], tau[4][i]},
-        {tau[5][i], tau[1][i], tau[3][i]},
-        {tau[4][i], tau[3][i], tau[2][i]}
-    };
-
-    // Compute grad_du_tau = grad_du*tau
-    CeedScalar grad_du_tau[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        grad_du_tau[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du_tau[j][k] += grad_du[j][m] * temptau[m][k];
-      }
-    }
-
-    // Compute depsilon = (grad_du + grad_du^T)/2
-    const CeedScalar depsilon[3][3] = {
-        {(grad_du[0][0] + grad_du[0][0]) / 2., (grad_du[0][1] + grad_du[1][0]) / 2., (grad_du[0][2] + grad_du[2][0]) / 2.},
-        {(grad_du[1][0] + grad_du[0][1]) / 2., (grad_du[1][1] + grad_du[1][1]) / 2., (grad_du[1][2] + grad_du[2][1]) / 2.},
-        {(grad_du[2][0] + grad_du[0][2]) / 2., (grad_du[2][1] + grad_du[1][2]) / 2., (grad_du[2][2] + grad_du[2][2]) / 2.}
-    };
-    // Compute trace(depsilon)
-    CeedScalar tr_deps = depsilon[0][0] + depsilon[1][1] + depsilon[2][2];
-    // Compute grad_du*tau + trace(depsilon)I3
-    grad_du_tau[0][0] += lambda * tr_deps;
-    grad_du_tau[1][1] += lambda * tr_deps;
-    grad_du_tau[2][2] += lambda * tr_deps;
-    // Compute dp = grad_du*tau + trace(depsilon)I3 +2(mu-lambda*logJ)depsilon
-    CeedScalar dp[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        dp[j][k] = grad_du_tau[j][k] + 2 * (mu - lam_log_J[0][i]) * depsilon[j][k];
-      }
-    }
-
-    // Apply dXdx^T and weight
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m][i] * dp[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-    // Strain energy Phi(E) for compressible Neo-Hookean
-    energy[i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    diagnostic[3][i]      = -lambda * logJ;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.;
-    }
-    diagnostic[6][i] = Jm1 + 1.;
-
-    // Strain energy
-    diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-// -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h b/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h
deleted file mode 100644
index 09c4bb99ce..0000000000
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h
+++ /dev/null
@@ -1,559 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, finite strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-//  The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean
-//  model.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES_SHIFTED
-#define LOG1P_SERIES_SHIFTED
-CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
-  const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) - 1;
-  CeedScalar       sum = 0;
-  if (1) {           // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient
-    if (x < left) {  // Replace if with while for arbitrary range (may hurt vectorization)
-      sum -= log(2.) / 2;
-      x = 1 + 2 * x;
-    } else if (right < x) {
-      sum += log(2.) / 2;
-      x = (x - 1) / 2;
-    }
-  }
-  CeedScalar       y  = x / (2. + x);
-  const CeedScalar y2 = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute det F - 1
-// -----------------------------------------------------------------------------
-#ifndef DETJM1
-#define DETJM1
-CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
-  return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) +
-         grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) +
-         grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
-         grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
-         grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute matrix^(-1), where matrix is symetric, returns array of 6
-// -----------------------------------------------------------------------------
-#ifndef MatinvSym
-#define MatinvSym
-CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[6]) {
-  // Compute A^(-1) : A-Inverse
-  CeedScalar B[6] = {
-      A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */
-      A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */
-      A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */
-      A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */
-      A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */
-      A[0][2] * A[2][1] - A[0][1] * A[2][2]  /* *NOPAD* */
-  };
-  for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA);
-
-  return 0;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store grad_u for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-  // Store C_inv for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*C_inv)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[2];
-  // Store constant lam_log_J = lambda*log(J)
-  CeedScalar(*lam_log_J)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[3];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Formulation Terminology:
-  //  I3    : 3x3 Identity matrix
-  //  C     : right Cauchy-Green tensor
-  //  C_inv : inverse of C
-  //  F     : deformation gradient
-  //  S     : 2nd Piola-Kirchhoff (in current config)
-  //  P     : 1st Piola-Kirchhoff (in referential config)
-  // Formulation:
-  //  F =  I3 + grad_ue
-  //  J = det(F)
-  //  C = F(^T)*F
-  //  S = mu*I3 + (lambda*log(J)-mu)*C_inv;
-  //  P = F*S
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_u[j][k][i] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // I3 : 3x3 Identity matrix
-    // Compute The Deformation Gradient : F = I3 + grad_u
-    const CeedScalar F[3][3] = {
-        {grad_u[0][0][i] + 1, grad_u[0][1][i],     grad_u[0][2][i]    },
-        {grad_u[1][0][i],     grad_u[1][1][i] + 1, grad_u[1][2][i]    },
-        {grad_u[2][0][i],     grad_u[2][1][i],     grad_u[2][2][i] + 1}
-    };
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]][i] + grad_u[indk[m]][indj[m]][i];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]][i] * grad_u[n][indk[m]][i];
-    }
-
-    const CeedScalar tempgradu[3][3] = {
-        {grad_u[0][0][i], grad_u[0][1][i], grad_u[0][2][i]},
-        {grad_u[1][0][i], grad_u[1][1][i], grad_u[1][2][i]},
-        {grad_u[2][0][i], grad_u[2][1][i], grad_u[2][2][i]}
-    };
-
-    const CeedScalar Jm1  = computeJM1(tempgradu);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    // store lam_log_J = lambda*log(J)
-    lam_log_J[0][i] = lambda * logJ;
-
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // C : right Cauchy-Green tensor
-    // C = I + 2E
-    const CeedScalar C[3][3] = {
-        {1 + E2[0][0], E2[0][1],     E2[0][2]    },
-        {E2[0][1],     1 + E2[1][1], E2[1][2]    },
-        {E2[0][2],     E2[1][2],     1 + E2[2][2]}
-    };
-
-    // Compute C^(-1) : C-Inverse
-    const CeedScalar detC = (Jm1 + 1.) * (Jm1 + 1.);
-    CeedScalar       Cinvwork[6];
-    computeMatinvSym(C, detC, Cinvwork);
-
-    // store C_inv
-    C_inv[0][i] = Cinvwork[0];
-    C_inv[1][i] = Cinvwork[1];
-    C_inv[2][i] = Cinvwork[2];
-    C_inv[3][i] = Cinvwork[3];
-    C_inv[4][i] = Cinvwork[4];
-    C_inv[5][i] = Cinvwork[5];
-
-    const CeedScalar tempCinv[3][3] = {
-        {C_inv[0][i], C_inv[5][i], C_inv[4][i]},
-        {C_inv[5][i], C_inv[1][i], C_inv[3][i]},
-        {C_inv[4][i], C_inv[3][i], C_inv[2][i]}
-    };
-    CeedScalar Swork[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      Swork[m] = lam_log_J[0][i] * C_inv[m][i];
-      for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * tempCinv[indj[m]][n] * E2[n][indk[m]];
-    }
-    // Second Piola-Kirchhoff (S)
-    const CeedScalar S[3][3] = {
-        {Swork[0], Swork[5], Swork[4]},
-        {Swork[5], Swork[1], Swork[3]},
-        {Swork[4], Swork[3], Swork[2]}
-    };
-
-    // Compute the First Piola-Kirchhoff : P = F*S
-    CeedScalar P[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        P[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) P[j][k] += F[j][m] * S[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight to P (First Piola-Kirchhoff)
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * P[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // grad_u is used for hyperelasticity (non-linear)
-  const CeedScalar(*grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  const CeedScalar(*C_inv)[CEED_Q_VLA]     = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  // lam_log_J = lambda*log(J)
-  const CeedScalar(*lam_log_J)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4];
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of delta_u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute graddeltau
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to deltadu = graddelta
-    CeedScalar graddeltau[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        graddeltau[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) graddeltau[j][k] += dXdx[m][k] * deltadu[j][m];
-      }
-    }
-
-    // I3 : 3x3 Identity matrix
-    // Deformation Gradient : F = I3 + grad_u
-    const CeedScalar F[3][3] = {
-        {grad_u[0][0][i] + 1, grad_u[0][1][i],     grad_u[0][2][i]    },
-        {grad_u[1][0][i],     grad_u[1][1][i] + 1, grad_u[1][2][i]    },
-        {grad_u[2][0][i],     grad_u[2][1][i],     grad_u[2][2][i] + 1}
-    };
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]][i] + grad_u[indk[m]][indj[m]][i];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]][i] * grad_u[n][indk[m]][i];
-    }
-
-    // deltaE - Green-Lagrange strain tensor
-    CeedScalar deltaEwork[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      deltaEwork[m] = 0;
-      for (CeedInt n = 0; n < 3; n++) deltaEwork[m] += (graddeltau[n][indj[m]] * F[n][indk[m]] + F[n][indj[m]] * graddeltau[n][indk[m]]) / 2.;
-    }
-    CeedScalar deltaE[3][3] = {
-        {deltaEwork[0], deltaEwork[5], deltaEwork[4]},
-        {deltaEwork[5], deltaEwork[1], deltaEwork[3]},
-        {deltaEwork[4], deltaEwork[3], deltaEwork[2]}
-    };
-
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    const CeedScalar tempCinv[3][3] = {
-        {C_inv[0][i], C_inv[5][i], C_inv[4][i]},
-        {C_inv[5][i], C_inv[1][i], C_inv[3][i]},
-        {C_inv[4][i], C_inv[3][i], C_inv[2][i]}
-    };
-    CeedScalar Swork[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      Swork[m] = lam_log_J[0][i] * C_inv[m][i];
-      for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * tempCinv[indj[m]][n] * E2[n][indk[m]];
-    }
-    // Second Piola-Kirchhoff (S)
-    const CeedScalar S[3][3] = {
-        {Swork[0], Swork[5], Swork[4]},
-        {Swork[5], Swork[1], Swork[3]},
-        {Swork[4], Swork[3], Swork[2]}
-    };
-
-    // deltaS = dSdE:deltaE
-    //      = lambda(C_inv:deltaE)C_inv + 2(mu-lambda*log(J))C_inv*deltaE*C_inv
-    // -- C_inv:deltaE
-    CeedScalar Cinv_contract_E = 0;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) Cinv_contract_E += tempCinv[j][k] * deltaE[j][k];
-    }
-    // -- deltaE*C_inv
-    CeedScalar deltaECinv[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        deltaECinv[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltaECinv[j][k] += deltaE[j][m] * tempCinv[m][k];
-      }
-    }
-    // -- intermediate deltaS = C_inv*deltaE*C_inv
-    CeedScalar deltaS[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        deltaS[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltaS[j][k] += tempCinv[j][m] * deltaECinv[m][k];
-      }
-    }
-    // -- deltaS = lambda(C_inv:deltaE)C_inv - 2(lambda*log(J)-mu)*(intermediate)
-    const CeedScalar llnj_m = lam_log_J[0][i] - mu;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) deltaS[j][k] = lambda * Cinv_contract_E * tempCinv[j][k] - 2. * llnj_m * deltaS[j][k];
-    }
-
-    // deltaP = dPdF:deltaF = deltaF*S + F*deltaS
-    CeedScalar deltaP[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        deltaP[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltaP[j][k] += graddeltau[j][m] * S[m][k] + F[j][m] * deltaS[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * deltaP[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]] + grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]] * grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-    const CeedScalar Jm1  = computeJM1(grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-    // Strain energy Phi(E) for compressible Neo-Hookean
-    energy[i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]] + grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]] * grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar Jm1  = computeJM1(grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    diagnostic[3][i]      = -lambda * logJ;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.;
-    }
-    diagnostic[6][i] = Jm1 + 1.;
-
-    // Strain energy
-    diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-// -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h b/examples/solids/qfunctions/finite-strain-neo-hookean.h
similarity index 97%
rename from examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h
rename to examples/solids/qfunctions/finite-strain-neo-hookean.h
index 431c8e328a..42b2b46e2c 100644
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h
+++ b/examples/solids/qfunctions/finite-strain-neo-hookean.h
@@ -52,7 +52,7 @@ CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
   y *= y2;
   sum += y / 7;
   return 2 * sum;
-};
+}
 #endif
 
 // -----------------------------------------------------------------------------
@@ -66,7 +66,7 @@ CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
          grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
          grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
          grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
+}
 #endif
 
 // -----------------------------------------------------------------------------
@@ -86,8 +86,8 @@ CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedS
   };
   for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA);
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 #endif
 
 // -----------------------------------------------------------------------------
@@ -136,13 +136,13 @@ CEED_QFUNCTION_HELPER int commonFS(const CeedScalar lambda, const CeedScalar mu,
     for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * C_inv[indj[m]][n] * E2[n][indk[m]];
   }
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 
 // -----------------------------------------------------------------------------
 // Residual evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSResidual_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -242,13 +242,13 @@ CEED_QFUNCTION(ElasFSInitialNH1F)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Jacobian evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSJacobian_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
         (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
@@ -387,13 +387,13 @@ CEED_QFUNCTION(ElasFSInitialNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Strain energy computation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSEnergy_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -457,13 +457,13 @@ CEED_QFUNCTION(ElasFSInitialNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *c
 
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Nodal diagnostic quantities for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSDiagnostic_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
         (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
@@ -542,6 +542,6 @@ CEED_QFUNCTION(ElasFSInitialNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScala
     diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/linear.h b/examples/solids/qfunctions/linear.h
index b688fdf495..20b293b6f1 100644
--- a/examples/solids/qfunctions/linear.h
+++ b/examples/solids/qfunctions/linear.h
@@ -23,7 +23,7 @@ struct Physics_private {
 // -----------------------------------------------------------------------------
 // Residual evaluation for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLinearF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasResidual_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -113,13 +113,13 @@ CEED_QFUNCTION(ElasLinearF)(void *ctx, CeedInt Q, const CeedScalar *const *in, C
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Jacobian evaluation for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLineardF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasJacobian_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
         (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
@@ -208,13 +208,13 @@ CEED_QFUNCTION(ElasLineardF)(void *ctx, CeedInt Q, const CeedScalar *const *in,
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Strain energy computation for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLinearEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasEnergy_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -275,13 +275,13 @@ CEED_QFUNCTION(ElasLinearEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *
 
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Nodal diagnostic quantities for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLinearDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasDiagnostic_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
         (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
@@ -357,6 +357,6 @@ CEED_QFUNCTION(ElasLinearDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *con
         (lambda * strain_vol * strain_vol / 2. + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu);
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/small-strain-neo-hookean.h b/examples/solids/qfunctions/small-strain-neo-hookean.h
index 95e0afa66c..13662dbdae 100644
--- a/examples/solids/qfunctions/small-strain-neo-hookean.h
+++ b/examples/solids/qfunctions/small-strain-neo-hookean.h
@@ -40,13 +40,13 @@ CEED_QFUNCTION_HELPER CeedScalar log1p_series(CeedScalar x) {
   y *= y2;
   sum += y / 7;
   return 2 * sum;
-};
+}
 #endif
 
 // -----------------------------------------------------------------------------
 // Residual evaluation for hyperelasticity, small strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasSSResidual_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -140,13 +140,13 @@ CEED_QFUNCTION(ElasSSNHF)(void *ctx, CeedInt Q, const CeedScalar *const *in, Cee
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Jacobian evaluation for hyperelasticity, small strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHdF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasSSJacobian_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
         (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
@@ -254,13 +254,13 @@ CEED_QFUNCTION(ElasSSNHdF)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Strain energy computation for hyperelasticity, small strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasSSEnergy_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -322,13 +322,13 @@ CEED_QFUNCTION(ElasSSNHEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *in
 
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Nodal diagnostic quantities for hyperelasticity, small strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasSSDiagnostic_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
         (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
@@ -405,6 +405,6 @@ CEED_QFUNCTION(ElasSSNHDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *const
         (lambda * (1 + strain_vol) * (llv - 1) + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu);
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/solids/src/cl-options.c b/examples/solids/src/cl-options.c
index 935ee6c9b8..996ffcfbcf 100644
--- a/examples/solids/src/cl-options.c
+++ b/examples/solids/src/cl-options.c
@@ -57,10 +57,7 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) {
   app_ctx->forcing_vector[2] = 0;
   PetscCall(PetscOptionsScalarArray("-forcing_vec", "Direction to apply constant force", NULL, app_ctx->forcing_vector, &max_n, NULL));
 
-  if ((app_ctx->problem_choice == ELAS_FSInitial_NH1 || app_ctx->problem_choice == ELAS_FSInitial_NH2 ||
-       app_ctx->problem_choice == ELAS_FSCurrent_NH1 || app_ctx->problem_choice == ELAS_FSCurrent_NH2 ||
-       app_ctx->problem_choice == ELAS_FSInitial_MR1) &&
-      app_ctx->forcing_choice == FORCE_CONST) {
+  if ((app_ctx->problem_choice == ELAS_FS_NH || app_ctx->problem_choice == ELAS_FS_MR) && app_ctx->forcing_choice == FORCE_CONST) {
     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP,
             "Cannot use constant forcing and finite strain formulation. "
             "Constant forcing in reference frame currently unavailable.");

From 380e68c47feddb82be6a549fea4dc65614d6b00c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 10 Jun 2024 16:37:25 -0600
Subject: [PATCH 056/571] solids - drop current config in docs

---
 examples/solids/index.md | 211 ---------------------------------------
 1 file changed, 211 deletions(-)

diff --git a/examples/solids/index.md b/examples/solids/index.md
index f4ff1816b9..e1334ff308 100644
--- a/examples/solids/index.md
+++ b/examples/solids/index.md
@@ -526,214 +526,3 @@ In the case where complete linearization is preferred, note the symmetry $\maths
 Along with 6 entries for $\bm S$, this totals 27 entries of overhead compared to computing everything from $\bm F$.
 This compares with 13 entries of overhead for direct storage of $\{ \bm S, \bm C^{-1}, \log J \}$, which is sufficient for the Neo-Hookean model to avoid all but matrix products.
 :::
-
-(problem-hyperelasticity-finite-strain-current-configuration)=
-
-## Hyperelasticity in current configuration
-
-In the preceeding discussion, all equations have been formulated in the initial configuration.
-This may feel convenient in that the computational domain is clearly independent of the solution, but there are some advantages to defining the equations in the current configuration.
-
-1. Body forces (like gravity), traction, and contact are more easily defined in the current configuration.
-2. Mesh quality in the initial configuration can be very bad for large deformation.
-3. The required storage and numerical representation can be smaller in the current configuration.
-
-Most of the benefit in case 3 can be attained solely by moving the Jacobian representation to the current configuration {cite}`davydov2020matrix`, though residual evaluation may also be slightly faster in current configuration.
-There are multiple commuting paths from the nonlinear weak form in initial configuration {eq}`hyperelastic-weak-form-initial` to the Jacobian weak form in current configuration {eq}`jacobian-weak-form-current`.
-One may push forward to the current configuration and then linearize or linearize in initial configuration and then push forward, as summarized below.
-
-$$
-\begin{CD}
-  {\overbrace{\nabla_X \bm{v} \tcolon \bm{FS}}^{\text{Initial Residual}}}
-  @>{\text{push forward}}>{}>
-  {\overbrace{\nabla_x \bm{v} \tcolon \bm{\tau}}^{\text{Current Residual}}} \\
-  @V{\text{linearize}}V{\begin{smallmatrix} \diff\bm F = \nabla_X\diff\bm u \\ \diff\bm S(\diff\bm E) \end{smallmatrix}}V
-  @V{\begin{smallmatrix} \diff\nabla_x\bm v = -\nabla_x\bm v \nabla_x \diff\bm u \\ \diff\bm\tau(\diff\bm\epsilon) \end{smallmatrix}}V{\text{linearize}}V \\
-  {\underbrace{\nabla_X\bm{v}\tcolon \Big(\diff\bm{F}\bm{S} + \bm{F}\diff\bm{S}\Big)}_\text{Initial Jacobian}}
-  @>{\text{push forward}}>{}>
-  {\underbrace{\nabla_x\bm{v}\tcolon \Big(\diff\bm{\tau} -\bm{\tau}(\nabla_x \diff\bm{u})^T \Big)}_\text{Current Jacobian}}
-\end{CD}
-$$ (initial-current-linearize)
-
-We will follow both paths for consistency and because both intermediate representations may be useful for implementation.
-
-### Push forward, then linearize
-
-The first term of {eq}`hyperelastic-weak-form-initial` can be rewritten in terms of the symmetric Kirchhoff stress tensor
-$\bm{\tau}=J\bm{\sigma}=\bm{P}\bm{F}^T = \bm F \bm S \bm F^T$ as
-
-$$
-\nabla_X \bm{v} \tcolon \bm{P} = \nabla_X \bm{v} \tcolon \bm{\tau}\bm{F}^{-T} = \nabla_X \bm{v}\bm{F}^{-1} \tcolon \bm{\tau} = \nabla_x \bm{v} \tcolon \bm{\tau}
-$$
-
-therefore, the weak form in terms of $\bm{\tau}$ and $\nabla_x$ with integral over $\Omega_0$ is
-
-$$
-\int_{\Omega_0}{\nabla_x \bm{v} \tcolon \bm{\tau}} \, dV
- - \int_{\Omega_0}{\bm{v} \cdot \rho_0 \bm{g}} \, dV
- - \int_{\partial \Omega_0}{\bm{v}\cdot(\bm{P}\cdot\hat{\bm{N}})} \, dS
- = 0, \quad \forall \bm v \in \mathcal V.
-$$ (hyperelastic-weak-form-current)
-
-#### Linearize in current configuration
-
-To derive a Newton linearization of {eq}`hyperelastic-weak-form-current`, first we define
-
-$$
-\nabla_x \diff \bm{u} = \nabla_X \diff \bm{u} \  \bm{F}^{-1} = \diff \bm{F} \bm{F}^{-1}
-$$ (nabla_xdu)
-
-and $\bm{\tau}$ for Neo-Hookean materials as the push forward of {eq}`neo-hookean-stress`
-
-$$
-\bm{\tau} = \bm{F}\bm{S}\bm{F}^T = \mu (\bm{b} - \bm I_3) + \lambda \log J \bm{I}_3,
-$$ (tau-neo-hookean)
-
-where $\bm{b} = \bm{F} \bm{F}^T$, is the left Cauchy-Green tensor.
-Then by expanding the directional derivative of $\nabla_x \bm{v} \tcolon \bm{\tau}$, we arrive at
-
-$$
-\diff \ (\nabla_x \bm{v} \tcolon \bm{\tau}) = \diff \ (\nabla_x \bm{v})\tcolon \bm{\tau} + \nabla_x \bm{v} \tcolon \diff \bm{\tau} .
-$$ (hyperelastic-linearization-current1)
-
-The first term of {eq}`hyperelastic-linearization-current1` can be written as
-
-$$
-\begin{aligned} \diff \ (\nabla_x \bm{v})\tcolon \bm{\tau} &= \diff \ (\nabla_X \bm{v} \bm{F}^{-1})\tcolon \bm{\tau} = \Big(\underbrace{\nabla_X (\diff \bm{v})}_{0}\bm{F}^{-1} +  \nabla_X \bm{v}\diff \bm{F}^{-1}\Big)\tcolon \bm{\tau}\\   &= \Big(-\nabla_X \bm{v} \bm{F}^{-1}\diff\bm{F}\bm{F}^{-1}\Big)\tcolon \bm{\tau}=\Big(-\nabla_x \bm{v} \diff\bm{F}\bm{F}^{-1}\Big)\tcolon \bm{\tau}\\   &= \Big(-\nabla_x \bm{v} \nabla_x \diff\bm{u} \Big)\tcolon \bm{\tau}= -\nabla_x \bm{v}\tcolon\bm{\tau}(\nabla_x \diff\bm{u})^T \,, \end{aligned}
-$$
-
-where we have used $\diff \bm{F}^{-1}=-\bm{F}^{-1} \diff \bm{F} \bm{F}^{-1}$ and {eq}`nabla_xdu`.
-Using this and {eq}`hyperelastic-linearization-current1` in {eq}`hyperelastic-weak-form-current` yields the weak form in the current configuration
-
-$$
-\int_{\Omega_0} \nabla_x \bm v \tcolon \Big(\diff\bm\tau - \bm\tau (\nabla_x \diff\bm u)^T \Big) = \text{rhs}.
-$$ (jacobian-weak-form-current)
-
-In the following, we will sometimes make use of the incremental strain tensor in the current configuration,
-
-$$
-\diff\bm\epsilon \equiv \frac{1}{2}\Big(\nabla_x \diff\bm{u} + (\nabla_x \diff\bm{u})^T   \Big) .
-$$
-
-:::{dropdown} Deriving $\diff\bm\tau$ for Neo-Hookean material
-To derive a useful expression of $\diff\bm\tau$ for Neo-Hookean materials, we will use the representations
-
-$$
-\begin{aligned}
-\diff \bm{b} &= \diff \bm{F} \bm{F}^T + \bm{F} \diff \bm{F}^T \\
-&= \nabla_x \diff \bm{u} \ \bm{b} + \bm{b} \ (\nabla_x \diff \bm{u})^T \\
-&= (\nabla_x \diff\bm u)(\bm b - \bm I_3) + (\bm b - \bm I_3) (\nabla_x \diff\bm u)^T + 2 \diff\bm\epsilon
-\end{aligned}
-$$
-
-and
-
-$$
-\begin{aligned} \diff\ (\log J) &= \frac{\partial \log J}{\partial \bm{b}}\tcolon \diff \bm{b} = \frac{\partial J}{J\partial \bm{b}}\tcolon \diff \bm{b}=\frac{1}{2}\bm{b}^{-1}\tcolon \diff \bm{b} \\ &= \frac 1 2 \bm b^{-1} \tcolon \Big(\nabla_x \diff\bm u \ \bm b + \bm b (\nabla_x \diff\bm u)^T \Big) \\ &= \trace (\nabla_x \diff\bm u) \\ &= \trace \diff\bm\epsilon . \end{aligned}
-$$
-
-Substituting into {eq}`tau-neo-hookean` gives
-
-$$
-\begin{aligned}
-\diff \bm{\tau} &= \mu \diff \bm{b} + \lambda \trace (\diff\bm\epsilon) \bm I_3 \\
-&= \underbrace{2 \mu \diff\bm\epsilon + \lambda \trace (\diff\bm\epsilon) \bm I_3 - 2\lambda \log J \diff\bm\epsilon}_{\bm F \diff\bm S \bm F^T} \\
-&\quad + (\nabla_x \diff\bm u)\underbrace{\Big( \mu (\bm b - \bm I_3) + \lambda \log J \bm I_3 \Big)}_{\bm\tau} \\
-&\quad + \underbrace{\Big( \mu (\bm b - \bm I_3) + \lambda \log J \bm I_3 \Big)}_{\bm\tau}  (\nabla_x \diff\bm u)^T ,
-\end{aligned}
-$$ (dtau-neo-hookean)
-
-where the final expression has been identified according to
-
-$$
-\diff\bm\tau = \diff\ (\bm F \bm S \bm F^T) = (\nabla_x \diff\bm u) \bm\tau + \bm F \diff\bm S \bm F^T + \bm\tau(\nabla_x \diff\bm u)^T.
-$$
-:::
-
-Collecting terms, we may thus opt to use either of the two forms
-
-$$
-\begin{aligned}
-\diff \bm{\tau} -\bm{\tau}(\nabla_x \diff\bm{u})^T &= (\nabla_x \diff\bm u)\bm\tau + \bm F \diff\bm S \bm F^T \\
-&= (\nabla_x \diff\bm u)\bm\tau + \lambda \trace(\diff\bm\epsilon) \bm I_3 + 2(\mu - \lambda \log J) \diff\bm\epsilon,
-\end{aligned}
-$$ (cur_simp_Jac)
-
-with the last line showing the especially compact representation available for Neo-Hookean materials.
-
-### Linearize, then push forward
-
-We can move the derivatives to the current configuration via
-
-$$
-\nabla_X \bm v \!:\! \diff\bm P = (\nabla_X \bm v) \bm F^{-1} \!:\! \diff \bm P \bm F^T = \nabla_x \bm v \!:\! \diff\bm P \bm F^T
-$$
-
-and expand
-
-$$
-\begin{aligned}
-\diff\bm P \bm F^T &= \diff\bm F \bm S \bm F^T + \bm F \diff\bm S \bm F^T \\
-&= \underbrace{\diff\bm F \bm F^{-1}}_{\nabla_x \diff\bm u} \underbrace{\bm F \bm S \bm F^T}_{\bm\tau} + \bm F \diff\bm S \bm F^T .
-\end{aligned}
-$$
-
-:::{dropdown} Representation of $\bm F \diff\bm S \bm F^T$ for Neo-Hookean materials
-Now we push {eq}`eq-neo-hookean-incremental-stress` forward via
-
-$$
-\begin{aligned}
-\bm F \diff\bm S \bm F^T &= \lambda (\bm C^{-1} \!:\! \diff\bm E) \bm F \bm C^{-1} \bm F^T
-  + 2 (\mu - \lambda \log J) \bm F \bm C^{-1} \diff\bm E \, \bm C^{-1} \bm F^T \\
-    &= \lambda (\bm C^{-1} \!:\! \diff\bm E) \bm I_3 + 2 (\mu - \lambda \log J) \bm F^{-T} \diff\bm E \, \bm F^{-1} \\
-    &= \lambda \operatorname{trace}(\nabla_x \diff\bm u) \bm I_3 + 2 (\mu - \lambda \log J) \diff\bm \epsilon
-\end{aligned}
-$$
-
-where we have used
-
-$$
-\begin{aligned}
-\bm C^{-1} \!:\! \diff\bm E &= \bm F^{-1} \bm F^{-T} \!:\! \bm F^T \diff\bm F \\
-&= \operatorname{trace}(\bm F^{-1} \bm F^{-T} \bm F^T \diff \bm F) \\
-&= \operatorname{trace}(\bm F^{-1} \diff\bm F) \\
-&= \operatorname{trace}(\diff \bm F \bm F^{-1}) \\
-&= \operatorname{trace}(\nabla_x \diff\bm u)
-\end{aligned}
-$$
-
-and
-
-$$
-\begin{aligned}
-\bm F^{-T} \diff\bm E \, \bm F^{-1} &= \frac 1 2 \bm F^{-T} (\bm F^T \diff\bm F + \diff\bm F^T \bm F) \bm F^{-1} \\
-&= \frac 1 2 (\diff \bm F \bm F^{-1} + \bm F^{-T} \diff\bm F^T) \\
-&= \frac 1 2 \Big(\nabla_x \diff\bm u + (\nabla_x\diff\bm u)^T \Big) \equiv \diff\bm\epsilon.
-\end{aligned}
-$$
-:::
-
-Collecting terms, the weak form of the Newton linearization for Neo-Hookean materials in the current configuration is
-
-$$
-\int_{\Omega_0} \nabla_x \bm v \!:\! \Big( (\nabla_x \diff\bm u) \bm\tau + \lambda \operatorname{trace}(\diff\bm\epsilon)\bm I_3 + 2(\mu - \lambda\log J)\diff \bm\epsilon \Big) dV = \text{rhs},
-$$ (jacobian-weak-form-current2)
-
-which equivalent to Algorithm 2 of {cite}`davydov2020matrix` and requires only derivatives with respect to the current configuration. Note that {eq}`cur_simp_Jac` and {eq}`jacobian-weak-form-current2` have recovered the same representation
-using different algebraic manipulations.
-
-:::{tip}
-We define a second order *Green-Euler* strain tensor (cf. Green-Lagrange strain {eq}`eq-green-lagrange-strain`) as
-
-$$
-\bm e = \frac 1 2 \Big(\bm{b} - \bm{I}_3 \Big) = \frac 1 2 \Big( \nabla_X \bm{u} + (\nabla_X \bm{u})^T + \nabla_X \bm{u} \, (\nabla_X \bm{u})^T \Big).
-$$ (green-euler-strain)
-
-Then, the Kirchhoff stress tensor {eq}`tau-neo-hookean` can be written as
-
-$$
-\bm \tau = \lambda \log J \bm I_{3} + 2\mu \bm e,
-$$ (tau-neo-hookean-stable)
-
-which is more numerically stable for small strain, and thus preferred for computation. Note that the $\log J$ is computed via `log1p` {eq}`log1p`, as we discussed in the previous tip.
-:::

From a80a54a28a60e8f6937c4280a1c4175f49372e36 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 10 Jun 2024 16:46:35 -0600
Subject: [PATCH 057/571] solids - drop small strain since its untested

---
 examples/solids/README.md                     |   2 +-
 examples/solids/problems/cl-problems.h        |   7 +-
 examples/solids/problems/problems.c           |   1 -
 examples/solids/problems/problems.h           |   1 -
 .../problems/small-strain-neo-hookean.c       |  58 ---
 .../qfunctions/small-strain-neo-hookean.h     | 410 ------------------
 6 files changed, 4 insertions(+), 475 deletions(-)
 delete mode 100644 examples/solids/problems/small-strain-neo-hookean.c
 delete mode 100644 examples/solids/qfunctions/small-strain-neo-hookean.h

diff --git a/examples/solids/README.md b/examples/solids/README.md
index c86180d6f2..d6e70d7be4 100644
--- a/examples/solids/README.md
+++ b/examples/solids/README.md
@@ -103,7 +103,7 @@ The command line options just shown are the minimum requirements to run the mini
   -
 
 * - `-problem`
-  - Problem to solve (`Linear`, `SS-NH`, `FS-MR`, etc.)
+  - Problem to solve (`Linear`, `FS-NH`, `FS-MR`, etc.)
   - `Linear`
 
 * - `-forcing`
diff --git a/examples/solids/problems/cl-problems.h b/examples/solids/problems/cl-problems.h
index acd8dcf15d..8a9036c995 100644
--- a/examples/solids/problems/cl-problems.h
+++ b/examples/solids/problems/cl-problems.h
@@ -7,8 +7,7 @@
 #pragma once
 
 // Problem options
-typedef enum { ELAS_LINEAR = 0, ELAS_SS_NH = 1, ELAS_FS_NH = 2, ELAS_FS_MR = 3 } problemType;
-static const char *const problemTypes[]        = {"Linear", "SS-NH", "FS-NH", "FS-MR", "problemType", "ELAS_", 0};
-static const char *const problemTypesForDisp[] = {"Linear elasticity", "Hyperelasticity small strain, Neo-Hookean",
-                                                  "Hyperelasticity finite strain Initial configuration Neo-Hookean",
+typedef enum { ELAS_LINEAR = 0, ELAS_FS_NH = 2, ELAS_FS_MR = 2 } problemType;
+static const char *const problemTypes[]        = {"Linear", "FS-NH", "FS-MR", "problemType", "ELAS_", 0};
+static const char *const problemTypesForDisp[] = {"Linear elasticity", "Hyperelasticity finite strain Initial configuration Neo-Hookean",
                                                   "Hyperelasticity finite strain Initial configuration Moony-Rivlin"};
diff --git a/examples/solids/problems/problems.c b/examples/solids/problems/problems.c
index 5cfd9769a9..1ee1c4c215 100644
--- a/examples/solids/problems/problems.c
+++ b/examples/solids/problems/problems.c
@@ -14,7 +14,6 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions) {
   PetscFunctionBegin;
 
   SOLIDS_PROBLEM_REGISTER(problem_functions, "Linear", ElasLinear, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "SS-NH", ElasSSNH, NH);
   SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-NH", ElasFSNH, NH);
   SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-MR", ElasFSMR, MR);
 
diff --git a/examples/solids/problems/problems.h b/examples/solids/problems/problems.h
index 6314426ac5..41c4271ffc 100644
--- a/examples/solids/problems/problems.h
+++ b/examples/solids/problems/problems.h
@@ -35,6 +35,5 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions);
                                           PetscInt u_loc_size, CeedVector fine_mult, CeedData *data);
 
 SOLIDS_PROBLEM(ElasLinear);
-SOLIDS_PROBLEM(ElasSSNH);
 SOLIDS_PROBLEM(ElasFSNH);
 SOLIDS_PROBLEM(ElasFSMR);
diff --git a/examples/solids/problems/small-strain-neo-hookean.c b/examples/solids/problems/small-strain-neo-hookean.c
deleted file mode 100644
index f6252807cf..0000000000
--- a/examples/solids/problems/small-strain-neo-hookean.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/small-strain-neo-hookean.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu"};
-static CeedInt           field_sizes[] = {9};
-
-ProblemData small_strain_neo_Hookean = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasSSResidual_NH,
-    .residual_loc         = ElasSSResidual_NH_loc,
-    .number_fields_stored = 1,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasSSJacobian_NH,
-    .jacobian_loc         = ElasSSJacobian_NH_loc,
-    .energy               = ElasSSEnergy_NH,
-    .energy_loc           = ElasSSEnergy_NH_loc,
-    .diagnostic           = ElasSSDiagnostic_NH,
-    .diagnostic_loc       = ElasSSDiagnostic_NH_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasSSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                              PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed,
-                                              CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, small_strain_neo_Hookean, fine_level, num_comp_u, U_g_size,
-                                  U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasSSNH(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                          PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, small_strain_neo_Hookean, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/qfunctions/small-strain-neo-hookean.h b/examples/solids/qfunctions/small-strain-neo-hookean.h
deleted file mode 100644
index 13662dbdae..0000000000
--- a/examples/solids/qfunctions/small-strain-neo-hookean.h
+++ /dev/null
@@ -1,410 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, small strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES
-#define LOG1P_SERIES
-CEED_QFUNCTION_HELPER CeedScalar log1p_series(CeedScalar x) {
-  CeedScalar       sum = 0;
-  CeedScalar       y   = x / (2. + x);
-  const CeedScalar y2  = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-}
-#endif
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSResidual_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store grad_u for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k][i] = 0;
-        for (int m = 0; m < 3; m++) grad_u[j][k][i] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-    const CeedScalar e00 = (grad_u[0][0][i] + grad_u[0][0][i]) / 2., e01 = (grad_u[0][1][i] + grad_u[1][0][i]) / 2.,
-                     e02 = (grad_u[0][2][i] + grad_u[2][0][i]) / 2., e11 = (grad_u[1][1][i] + grad_u[1][1][i]) / 2.,
-                     e12 = (grad_u[1][2][i] + grad_u[2][1][i]) / 2., e22 = (grad_u[2][2][i] + grad_u[2][2][i]) / 2.;
-    const CeedScalar e[3][3] = {
-        {e00, e01, e02},
-        {e01, e11, e12},
-        {e02, e12, e22}
-    };
-
-    // strain (epsilon)
-    //    and
-    // stress (sigma) in Voigt notation:
-    //           [e00]              [sigma00]
-    //           [e11]              [sigma11]
-    // epsilon = [e22]  ,   sigma = [sigma22]
-    //           [e12]              [sigma12]
-    //           [e02]              [sigma02]
-    //           [e01]              [sigma01]
-    //
-    // mu = E / (2 * (1 + nu))
-    // bulk modulus = E / (2 * (1 - 2 * nu))
-    // lambda = (3 * bulk modulus - 2 * mu) / 3
-    // e_v = volumetric strain = e00 + e11 + e22
-    //
-    // sigma = lambda * log(1 + e_v) + 2 * mu * epsilon
-    //
-    // Above Voigt Notation is placed in a 3x3 matrix:
-    // Volumetric strain
-    const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2];
-    const CeedScalar llv        = log1p_series(strain_vol);
-    const CeedScalar sigma00 = lambda * llv + TwoMu * e[0][0], sigma11 = lambda * llv + TwoMu * e[1][1], sigma22 = lambda * llv + TwoMu * e[2][2],
-                     sigma12 = TwoMu * e[1][2], sigma02 = TwoMu * e[0][2], sigma01 = TwoMu * e[0][1];
-    const CeedScalar sigma[3][3] = {
-        {sigma00, sigma01, sigma02},
-        {sigma01, sigma11, sigma12},
-        {sigma02, sigma12, sigma22}
-    };
-
-    // Apply dXdx^T and weight to sigma
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (int m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * sigma[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return CEED_ERROR_SUCCESS;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSJacobian_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // grad_u is used for hyperelasticity (non-linear)
-  const CeedScalar(*grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute graddeltau
-    // Apply dXdx^-1 to deltadu = graddeltau
-    CeedScalar graddeltau[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        graddeltau[j][k] = 0;
-        for (int m = 0; m < 3; m++) graddeltau[j][k] += dXdx[m][k] * deltadu[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-    const CeedScalar de00 = (graddeltau[0][0] + graddeltau[0][0]) / 2., de01 = (graddeltau[0][1] + graddeltau[1][0]) / 2.,
-                     de02 = (graddeltau[0][2] + graddeltau[2][0]) / 2., de11 = (graddeltau[1][1] + graddeltau[1][1]) / 2.,
-                     de12 = (graddeltau[1][2] + graddeltau[2][1]) / 2., de22 = (graddeltau[2][2] + graddeltau[2][2]) / 2.;
-    const CeedScalar de[3][3] = {
-        {de00, de01, de02},
-        {de01, de11, de12},
-        {de02, de12, de22}
-    };
-
-    // strain (epsilon)
-    //     and
-    // stress (sigma) in Voigt notation:
-    //             [e00]               [sigma00]
-    //             [e11]               [sigma11]
-    //  depsilon = [e22]  ,   dsigma = [sigma22]
-    //             [e12]               [sigma12]
-    //             [e02]               [sigma02]
-    //             [e01]               [sigma01]
-    //
-    //  mu = E / (2 * (1 + nu))
-    //  bulk modulus = E / (2 * (1 - 2 * nu))
-    //  lambda = (3 * bulk modulus - 2 * mu) / 3
-    //  e_v = volumetric strain = e00 + e11 + e22
-    //  lambda bar = lambda / (1 + e_v)
-    //
-    //  dSigma = S * epsilon
-    //
-    //  S_ijkl = lambda bar * delta_ij * delta_kl + 2 * mu * delta_ik * delta_jl
-    //
-    //  Matrix form:
-    //
-    //      [2 mu + lambda bar     lambda bar         lambda bar                       ]
-    //      [   lambda bar      2 mu + lambda bar     lambda bar                       ]
-    //      [   lambda bar         lambda bar      2 mu + lambda bar                   ]
-    //  S = [                                                           mu             ]
-    //      [                                                                 mu       ]
-    //      [                                                                       mu ]
-    //
-    //  Above Voigt Notation is placed in a 3x3 matrix:
-    const CeedScalar strain_vol    = grad_u[0][0][i] + grad_u[1][1][i] + grad_u[2][2][i];
-    const CeedScalar lambda_bar    = lambda / (1 + strain_vol);
-    const CeedScalar lambda_dtrace = lambda_bar * (de[0][0] + de[1][1] + de[2][2]);
-    const CeedScalar dsigma00 = lambda_dtrace + TwoMu * de[0][0], dsigma11 = lambda_dtrace + TwoMu * de[1][1],
-                     dsigma22 = lambda_dtrace + TwoMu * de[2][2], dsigma12 = TwoMu * de[1][2], dsigma02 = TwoMu * de[0][2],
-                     dsigma01     = TwoMu * de[0][1];
-    const CeedScalar dsigma[3][3] = {
-        {dsigma00, dsigma01, dsigma02},
-        {dsigma01, dsigma11, dsigma12},
-        {dsigma02, dsigma12, dsigma22}
-    };
-
-    // Apply dXdx^-T and weight
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (int m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * dsigma[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return CEED_ERROR_SUCCESS;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSEnergy_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-
-    const CeedScalar e[3][3] = {
-        {(grad_u[0][0] + grad_u[0][0]) / 2., (grad_u[0][1] + grad_u[1][0]) / 2., (grad_u[0][2] + grad_u[2][0]) / 2.},
-        {(grad_u[1][0] + grad_u[0][1]) / 2., (grad_u[1][1] + grad_u[1][1]) / 2., (grad_u[1][2] + grad_u[2][1]) / 2.},
-        {(grad_u[2][0] + grad_u[0][2]) / 2., (grad_u[2][1] + grad_u[1][2]) / 2., (grad_u[2][2] + grad_u[2][2]) / 2.}
-    };
-
-    // Strain Energy
-    const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2];
-    const CeedScalar llv        = log1p_series(strain_vol);
-    energy[i] =
-        (lambda * (1 + strain_vol) * (llv - 1) + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return CEED_ERROR_SUCCESS;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSDiagnostic_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-
-    const CeedScalar e[3][3] = {
-        {(grad_u[0][0] + grad_u[0][0]) / 2., (grad_u[0][1] + grad_u[1][0]) / 2., (grad_u[0][2] + grad_u[2][0]) / 2.},
-        {(grad_u[1][0] + grad_u[0][1]) / 2., (grad_u[1][1] + grad_u[1][1]) / 2., (grad_u[1][2] + grad_u[2][1]) / 2.},
-        {(grad_u[2][0] + grad_u[0][2]) / 2., (grad_u[2][1] + grad_u[1][2]) / 2., (grad_u[2][2] + grad_u[2][2]) / 2.}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2];
-    const CeedScalar llv        = log1p_series(strain_vol);
-    diagnostic[3][i]            = -lambda * llv;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = strain_vol;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += e[j][m] * e[m][j];
-    }
-    diagnostic[6][i] = 1 + strain_vol;
-
-    // Strain energy
-    diagnostic[7][i] =
-        (lambda * (1 + strain_vol) * (llv - 1) + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu);
-  }  // End of Quadrature Point Loop
-
-  return CEED_ERROR_SUCCESS;
-}
-// -----------------------------------------------------------------------------

From 4aa5fbb4c55e771431ba815a9dddd623e4e3362a Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 11 Jun 2024 10:11:00 -0600
Subject: [PATCH 058/571] fix(fluids): Spanwise width miscalculated for
 statsitics

---
 examples/fluids/src/turb_spanstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 5b7198952b..f74e2d98ca 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -40,7 +40,7 @@ PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) {
 
   // Get spanwise length
   PetscCall(DMGetBoundingBox(user->dm, domain_min, domain_max));
-  user->spanstats.span_width = domain_max[2] - domain_min[1];
+  user->spanstats.span_width = domain_max[2] - domain_min[2];
 
   {  // Get DM from surface
     DM             parent_distributed_dm;

From 7a57a7a00fd3cde6c897304c5734b40885fd49f4 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 11 Jun 2024 10:41:28 -0600
Subject: [PATCH 059/571] fluids: Add IDL to RHSFunction_Newtonian

---
 examples/fluids/qfunctions/newtonian.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h
index 4804965e93..a94ee4f604 100644
--- a/examples/fluids/qfunctions/newtonian.h
+++ b/examples/fluids/qfunctions/newtonian.h
@@ -143,15 +143,18 @@ CEED_QFUNCTION(RHSFunction_Newtonian)(void *ctx, CeedInt Q, const CeedScalar *co
   const CeedScalar(*q)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   const CeedScalar(*Grad_q)          = in[1];
   const CeedScalar(*q_data)          = in[2];
+  const CeedScalar(*x)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[3];
   CeedScalar(*v)[CEED_Q_VLA]         = (CeedScalar(*)[CEED_Q_VLA])out[0];
   CeedScalar(*Grad_v)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1];
 
   NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx;
   const CeedScalar        *g       = context->g;
   const CeedScalar         dt      = context->dt;
+  const CeedScalar         P0      = context->idl_pressure;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar U[5], wdetJ, dXdx[3][3];
+    CeedScalar       U[5], wdetJ, dXdx[3][3];
+    const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]};
     for (int j = 0; j < 5; j++) U[j] = q[j][i];
     QdataUnpack_3D(Q, i, q_data, &wdetJ, dXdx);
     State s = StateFromU(context, U);
@@ -179,6 +182,13 @@ CEED_QFUNCTION(RHSFunction_Newtonian)(void *ctx, CeedInt Q, const CeedScalar *co
     const CeedScalar body_force[5] = {0, s.U.density * g[0], s.U.density * g[1], s.U.density * g[2], Dot3(s.U.momentum, g)};
     for (int j = 0; j < 5; j++) v[j][i] = wdetJ * body_force[j];
 
+    if (context->idl_enable) {
+      const CeedScalar sigma         = LinearRampCoefficient(context->idl_amplitude, context->idl_length, context->idl_start, x_i[0]);
+      CeedScalar       damp_state[5] = {s.Y.pressure - P0, 0, 0, 0, 0}, idl_residual[5] = {0.};
+      InternalDampingLayer(context, s, sigma, damp_state, idl_residual);
+      for (int j = 0; j < 5; j++) v[j][i] -= wdetJ * idl_residual[j];
+    }
+
     // -- Stabilization method: none (Galerkin), SU, or SUPG
     CeedScalar Tau_d[3], stab[5][3], U_dot[5] = {0};
     Tau_diagPrim(context, s, dXdx, dt, Tau_d);

From 6a5b1f8c8ca6bae6b6457d14f82c38963f859dea Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 11 Jun 2024 10:54:03 -0600
Subject: [PATCH 060/571] test(fluids): Modify test for IDL explicit

- Repurposing the Gaussian wave explicit test to also include IDL
  testing
- Also modify the Advection skew tolerance slightly to pass on my local
  machine (failed with 5.076e-10 on the avx/blocked backend)
---
 examples/fluids/navierstokes.c                |   4 ++--
 ...ids-navierstokes-gaussianwave-explicit.bin | Bin 7092 -> 7092 bytes
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 6f424034b7..f4c9676907 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -18,9 +18,9 @@
 //     ./navierstokes -ceed /cpu/self -options_file gaussianwave.yml
 //     ./navierstokes -ceed /gpu/cuda -problem advection -degree 1
 //
-//TESTARGS(name="Gaussian Wave, explicit, supg") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal
+//TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
 //TESTARGS(name="Advection 2D, rotation, explicit, supg, consistent mass") -ceed {ceed_resource} -test_type solver -problem advection -degree 3 -dm_plex_box_faces 2,2 -dm_plex_box_lower 0,0 -dm_plex_box_upper 125,125 -bc_wall 1,2,3,4 -wall_comps 4 -units_kilogram 1e-9 -rc 100. -ts_dt 1e-3 -ts_max_steps 10 -stab supg -Ctaus 0.5 -mass_ksp_type gmres -mass_pc_type vpbjacobi -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv2d-rotation-explicit-stab-supg-consistent-mass.bin
-//TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew  -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 5e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin
+//TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew  -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 7e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin
 //TESTARGS(name="Blasius, bc_slip") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/blasius.yaml -ts_max_steps 5 -dm_plex_box_faces 3,20,1 -platemesh_nDelta 10 -platemesh_growth 1.2 -bc_outflow 5 -bc_slip 4 -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-bc_slip.bin
 //TESTARGS(name="Blasius, SGS DataDriven Sequential") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_use_fused false
 //TESTARGS(name="Advection, rotation, cosine") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 0 -advection_ic_type cosine_hill -dm_plex_box_faces 2,1,1 -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-rotation-cosine.bin
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin
index bd8ea4f1637767f0499fa2ce30d2014f4f4fb490..27826f39c22b2c9f0a56d37692a225c003c3e95c 100644
GIT binary patch
literal 7092
zcmXY#c_5VQ7stmG!WAh}(ngj-6kQebzA2@$bW2HSL4zWZmfI>6O4*7^ixygBX`{rH
z646GbL{zwTA%(Kc@4V;y`ma2F&)0XJIp_JDzrKHQ=g}WYCS>=wMQNk<;9r&Y*wQ}_
z@n_m?)NVFAGt6Li;ye}gNJi|PX6tFOUNAvLZOP=B8LZIHC+Dr=R;Imm=3(dG+Zpi@
z-&=<l9bv?WKIu+Yt`m*VE6OlD$Y+HmA!!HS{$Sb{xtGj5YRwErrY7mWD`muX2K>UK
zK>|&wppHd*9ay19;LT)hZKgv(Vw%<meMX#;e?;N)RF<corJXyb*8G=$|7H8<7g=F{
zbmhI<&&i+d3-qUayO8|NA0nEdtQll&xf}hJ4%|Y2E@r0W@7Zbc=d@NZUgSPJ)b1eq
z3sQ+Be^HakUnZaY1>Y2DN<{?b*}oxwI_2c=t0ekU92!LaCJ)d3wQ~kLcjsF2SMY&p
z7bjPoc1dN#t)6LNIz^1Q=dp&Q#=hK=t;@2Oj~K=Z>+^G}Q<~u1cweQLVwN}btDt_-
zZk8vZlz&A3UT#nL@BHOA>sVn!yPQqYAk!g{5aN3<j1m8EKbE`WDa(^Qd(Z5Bq-fIp
zsltYHYOJtXv^;3AooOGaxmUVyG0T%y9I0@uk`e!j-<KHqK%j19mHhN&7t{?A&KP-z
z{CVyoe-`h_pT3KE$;M>g2Ez#Qm-&_a<^3jqW^2ixPKbF=L}Y@y(3kv)zLUShDdg|)
zcJfzYESQw_!{^n$Dd;a)W-<9QS0I0(bIISK!y<LVO2_e&Ov#_DFVjBL<nHCwJ<Olx
z8{vI2a9=!S1BUk(33yTM8}FxmWQ7MN4jrQI&ve>*sa~IPhvnJs^-nT$X8PB!a?ARn
zZr;^+H_0t>E$g{Z*{wT%B-3uv;O{hG%lxssYnNjd%82;_YsDW%0)6%U5_UIQSz+j*
z#CNHyna;hIZ|od=Sl$Z9e7#Uzc1WE@9RIMUVB#jvJD!bYtY`3lc~e#2lfUCD(chmA
z1N5h0xLw3MaXQ$nC!PE)JxcyeAP>%;Lksz<?#%6K`7q~P!Ey3;u7~`6-bent?vlTf
zHzIv?-7nf6DdaEq8TniGiTt_uk-rdo(L}GurbD-Pkv~~iro+(gr_O@{mgnpzwfo-{
z@OgCe^(B6r|E!JpUR!3#dRR{#-e$UuX)pg-ll=M+%hNp*>o{*JBkpwFpI_r4kSmX=
z9};zv6+SW$HeAYPI!qr&Y`)*X^0vNs%6yx{@*Ky;_%&$>RKon{&3diLdaTdS8ao<3
z-?s|c%Aaajo`KottCxEi@gqC&6|ZEG{#&!4`(DPZF!Qcjci9y3ckd+m+gXPG^afRP
zf7TSg%T@eL{tTCrzmIa{Z@wS-YndvNEAKK6<q64O$_Vn8GwT2T+{j;RsYoU4K;RhF
zAo8a!B!6W^<nIdHE6$&Lv_SvujJ>N|#*n|#B~1Hnd5NE`zgXUGrDFa-2FqJFHneh3
zi4|Tq?G1l=h3VkwC5CL!WqB5!+Vk8tvOMjucSl;_d)WT0=HRM_Ob37Ws9F1=FSV;q
zGn&T0eK#B-(+_>KRpn;)F?FUr@X4djst8tM=)ST;?!TBp|JrSpiPu<RhF+Ky)4_B|
z3;w-wU^L5{SMhGGmn}2cc(U;-d!6;*&HmOhs1Dz^+~!q{;f(k{E1&G+a9`fvGK(@9
z$qE~7lnyq`WZJJO7>`@>jFnh2G23hR8J1`G^@>FOeO6d{$!SL1Hl{=KL_x274l7Z(
zrmCWN1S=uypl%u9$a*ZupQo_;8Pg%hlig@o$B6HDUvZPnf!{O!kfYK*Rydemn|oLp
z^BMep1@n=SfW83vd@Tok$S2l$7v;n3!F<@3A(T&>ALTP@4CP}bf%zELLEq!@Not{d
z#s_0QzkK2_pU~)|m`{bH81s?w-%k0Y4p2VFODUg>BFv|C%@fKeYcb`cAjEtkn|djq
z!(+I7W;V|*!hEbJr*ipRZkVi(`S5dwVLrbn?#FzpJZdqYg%Jsu&xr85l+Rx(n9qif
zuP~q0zilWV)45Fh`tR9kWiMD>{>3M;+g34ROXZ4r4|+vfh1V*EOtWT%vGUK7Z`UyG
z{-WA2pGuZj`(Rj9ZV4kkCf#(;tV%R`MQd%^*$b@joJLlf(Mr_2bNCqQ#px8I-XiT&
z0<9Y_vQ*<Ip<aq{A?h8OZGn2x1#eL=Q`bo_+Wg0E@%@*mm;S|sY5(EgRw(@wa(_R&
zJFSrsmw&q&e4;UTK<b)yam7|v_^oZk)9169_QC4<MHWdcZ{gHp=CU!%lVQr9r%MT@
zq)+z#<zLDQzh|rU_S>LdL6QsUt$_K-=}mVr9~k?7UO~?n(%WcFdSm5DFYq4eol_A_
zNmEUl9?2uUcbDMvvnY0-dWI2qoO+XCB*%#R8r_S^CkUoL%Gf??iz+K@8WX?EzZ34!
zHU9j@SVr6skTcl}=AxA3vEVPgqDhtRUQaf@fcqraF6?q9y#=dDFT4Qtilx0o)AKau
zm|LwOz2pO=*EgH=5`#&vt6wmw!q2}m@-FImsD5NR{;JbI{VSg3KaomI+>yWxUQ-oc
zU1FNs<Gzf~>xVv9?eaNMLxbrU?X=SPunapyc9C*=Q8~-IoYrU;V<4E~8CbV!3%rk-
z>!W8_Y(c$*&N--e>7y0u-B>i-y!*lsZGIuV13>TcH}D5~GhF{cy;O5a)U)Z-5lyjq
zoz|Gshk6Eow=*4^^3NZzNMj{K-pacscQImZIf+Njvva>l99?c$_JQ@dEnMhrV#jps
zwlLGGKFdnEE$K3pyTM9~k4*mBQzX(_sp_}RqKftSxFPm~0rVqIZ*C3gg&ZZlx(M?x
zYVr|ApIu2$tCI8@kCUFU1?l~%5op;@@ZZ1An)K$kFdaH~_r;7mz=#*6D@~DK&GO{N
zy;2N{5*c2PDYH#0U_Iu(j%>4B#B@yedcov~7_q@6p_jA?+#C5A$-q8=X<%ZrRZ0lV
ziQ}KOHqIqIm5Zb&9YA_}-35l%PrAO#E+)O%@P6a;3X(`qagg*Pf<&f)kA`Xbc$40)
zXr_IeC~%ldBQscCXIj`gkL87WZ*p{N5GX9$5tnNGgB8ZdueN1pF&)bJex5@g!dyy|
ztedcb<$2ALi&PE}O#3Yv8QV6J^%&+cZG^)o(vwgpz53^*rx7etSon60OyA$AXVa94
zdLEr{PeHHt`AO3I%Uv|B*HFRwZwb;fY+>3hOh5lIUIXvd{lAXC4PeBft(`*ehk{9u
z91Krbr?Ns<E2Yg&Nib)}?Ye*Z1HAJbBqIl(GkwJ-8_!=KBO1Ok@s`fIIM!q7^<9^$
zs!{KE?*`Jlok@BNr9_iN%+B?lk4aC{oAhpEp<dL8X3~3AFBrZe@L<<n0qG6G``5OU
zFLUJtGjP#AZT*}sX5fR4OVutb!DOR{k27b)vcixp*;SW<nT`{C9@k0=VIHSk4)8s|
z4*8)TnmmvxQd;I!@XzC~tVhk639%!mlU|b>>aA1kCB57|qRB?Rb3`okKTfYmKzfVk
zqTcrD_fYTZ!Ww~+Re&eo2krr<mk4$FygQ3iq8ahPnY-u9!dafIO51?}6TxVe?%}PX
zrL3@}+|T0o7kIB0y)Foehxf1F&Rw^^Fyh}kdDlyg1Y>_X`U<-Y;k`QF@XFH-q?h6G
ze?30x1rDtdjUMgX9{c<h>dhYlbB)sr@+Q5^R?;iFAR7DAdYi%EG}6n1`^@vY<?r<i
z=2w$-z9r0sCtvr4X?!s6Y(1;uo@Wc^V&q$!AH(}R)^d@J56c@CvE$F`3D8g7g2y$@
z7OA%#F-{*SgueFo)8%I>NN)?=15WRsZqn0@&+TkWd9$HGmh^sFk{<Jt^p;&Ey}C&P
z^|mJ2Hy%)z(>oRf{c6T26K8XlCvT&fS3DBlxBX*mPRa<T<vZ*0n+#bG{z18<gea!H
zJG|6v1-yThnq9oMk7LBo*2Ev5w@sjMlf6E&)C>CB4!+wC=;NH;x^<*i1O0&0i(4a_
zmS3`TgO?rYU8*BJyT_!r0s1tj$99PnZsfOGHqRkF{|m^6gGr&thguTe$cKB}fa8E$
z8<uz@7lzGXkPEZfeB?s=JHQpdt2ZkzBM+`fszx4MycakLc(s1YBILlrf&k<|jjPa?
zfKz4iV15Iq-b(|H06tCEZKnDKkFmb?UyfAYHHqr~tA_PMkHY&6xcPx3@HKF=y<`N{
zpK+b)pHZaxg)j%X`lgyx|H(nB9|?1otM3foAFlqH2UtI?=N!j(l@2;p9N(oc&DTM`
zd)XEVpL2e0Lbn*XPE#)mxvr`ID{@^_Ml$l;+FE;<mkA;JysePuDmvF7&sjY`h8%Z3
zg~xGRrd&}e%zMR=x<6JT$5oGb2l^pGZ+ly+FMo^b2MtjD_bk<~bfNlI@V*7^ZC1TN
z^>g8U2E4mg3@>1={<_0dzZiZFyla(xo9ag+M`p@>t;YIO`8TQlRPK8L^*6&_0{F35
zwUFbpBV)qjkk4ke$sw0bHCTpR7A<f>E{i%Ghg=poDH?gK&<pwv@S<N)H^cE_VI*)o
z@Z$L063Ag~dfLcgPYfm^hb6e*LJkW)S;Fzx6PemaSik6G4Ap<Vl<NCz#`?OsOR4@W
zm@B}AQ4B-%CpuC6l|fkFe@q3{AFDw1UwotbUQJY=$*1}`o2Y(*I@Xt!U4VSW6v6ih
z_%Bk;7x~Ik-I?Pm1+{+#99Pxqn*D=Zbs--5LZ|D2CEkZPp7QYW9f~{^7k&+S%ICiy
z$Wy0f)R3cm3ssP#MlY`-j=G+K9CbtgIPz1k^L?uSZW+}d5B&l7(^4y$>PMtdeeLg5
zKURb4J6xjrPUES55abKI=~EB<0laydH=pX?zee?)fjc<POm?ICvz@5^M!5gmL<iFO
zy*O`O{)@$Vt117Qo3jo-#mP8l&kyh9=B(7}ld#`_x&53UkMnikva{IF4R^(3Ki}M=
ziv4`;v0vE76}=U)k0-r1z&`FVL>l|}$u<qRfAbAP?47uITIIDw1Lx_lfTuW5tF>WY
z0(1GZ^c|d|#fJal9L>=x#W`BK6y7~BkDvV=g8lmJ(KN1K+o-LF_c`?c(gt`}*2fHI
zGm7E+6Ki?073S8kh@ImHPhp>4uz4N!=fqxpoR`9EN1T@_`}Meaxz^$77H&>HG;?^v
z&B;8;Q(JIOo^uGpIaw$33Fl*_ia+*a=ksuHpue}R?8kn*FGCOe@K3j=TpunGUld>;
zwpw-p`>^Jv1=xSJ^b;ummd%uZ7w|K@pET2?G5=rXT9iKn-wVj!aSO~ZF8{Gol>e<(
z%HO-1^8YPE`R|!U`P*kw{-10ye_`(p%75@P<?rZ7`B(iz`NMvSdmmMYuBQAW=TZLB
z@NR>5(Zjw$%)g{#H|C$;0PlO4zh@78r2H2+Q2yoTDgV<qF#kCDM$CVLq8;VGOAqtE
zs9H<;kCesy)3&8h{%XLNkpJ9%L(G3@3-nKT_atmRgZbAk<757x(!NptCt)wc<*(vN
z`KNrL{Ox8_{x^ZEVXj(*^i%#xk0}2+2Z<LYfZKpSqx9O47cI71ASYhU{K#=)O_SzU
z<V2&&X5>Vr%_EQxhfDs3`gQuVowATe<{tIrcx15B2>7DfrLw9V_5;8ns_S%+Lqa}|
zLk^kq)r8}ZsDUtemjGX8nv6#ttbGOVEZ{+p$v2P#Z@&jl1+IK}aU*iz{wjDM$S@zI
zvY&DMH_j(|BJ#$qG(+T#b73%twB`8fKJfkpZYfDEL(Z^ji9ybAKfjgZjH^SpgmZjx
zLhS_n9Qg63{U?t1N|-IMXXtC(xb<Bod_H4N4Lf!N?$<TGzuR%-yh}kazkwT5{nC){
zo;IyTo_LW5{}1>i(G%tb@QL-yevTueHb~bXN6e^!{UvaTsvpO*^3ly3^pPLrRuZpY
zwT#C6!#D24{Ee&N^8xNZ>pKbauS*`l{HHJ2g!wCupNaX)yiLUXja_9if0LL@%>Uip
zi<tlWfTftfdd`2C|51Si<}diAfcz(&VvqS}P1}L_*WFRY{5>aiVE%Vr&cgh2q%|@B
z12bVi#N}WAp7QS|e!P`8mh!KePWkWufBt3g{Qz#XD)~tH8-JtxH+oY3&fuTRe-!jL
z$X^iYOZhv)evHfiFW?d`e-nAk|KcA9%AbY)4ZQyJXb<K8I-T<OJ4X3edQkq8cT)bv
z$0+}`+LXVgJLRtp^BeYeHNh&}eyXT*`$gPO<)4!i>B&oJjtshrd#b&M{P0|L@M}ER
zK0FG~JxRSS(9?W)zT*8sJhv+Y`XTK5qHjYVf_<ND_kG-FMM&>4?{(i2%y_}O3-*2q
zvoF%QN89k6oz{0cCt04`>uI*{wA~<`vsZ-QfA=V%whhDW2g593PXzlfmr0ui+5uNG
zm;mk`s-&*ESV!l~UFn=_FP&R(Or*U%OETtZEuDKE$nDD>H)*G{uuuFT^JX}=FAI1P
zJ2|%}MymgA0=%1Guh^gLf#(`T>+syLAFX)qrSfO<o(ci;x!4`g^||tKzvlEW%zxNF
zj`bJgeoc8$N2Hy0<5AU@rMPEvyaT^4>?xgu6X=|)kj|BA3AEGI;x@eurE{?F<o1=D
z-QXPx`^w*OuW%oidU}(<DD>)Be&P|_%cW?_)482<=-f*^I+wLaWE3V}7UKbX8rW~9
zEIoz$yFSbFEVtiWb#@f)@2aNdiww>TT{z6P2lsf&;&R-BDwvnyIagaZ+VfRt2@K#X
zxV&>Qp3_x+kNduk%2&AWYm{7q`@Us8wSuv-Po}L1PsP2TM9U{S=lcoI)ej`nx$x(r
zF@p>HO6KpUa|-)$KRE9c_=o-J=(s@K56TXV5E$w1%~aZS2KR&kpMKD}q&z&QD)Ru(
U$w|qJjHYUa_5}{5bKUv>1HIH5mH+?%

literal 7092
zcmY+Jd0b8F_s92XCYnT}QbKbwBy#rNbyXCZB0{E|Q<Mz1G-x7Y%9uG_sT*bbiezXT
zE7Vn?P>E)dIw_iPerrFge}4YEUdv~#^M0Pa*7I3+NZUtW3jfG6apASDS2ZKZe`-W{
z;9tvbVcMnZSB{>sm8Awv>gd{_$4GzmW?H}f!^vuxu0Ivr%!-=5yF+8=Fzph>7h~mT
zGtw^W3@cv+M!NTJhjEW%^J>p%F0eW*U`3T-|9#k5%Sfo1i#LwE!$@mo-s=U$vy@C@
z-JH-WPIIikx4XR@EBYKOG>)odBx;<+DMd$?8oF|r^0x+-vT6VMqIY^;tGHWG`L_!z
zmhXsEK6e7^bA5#M1*BnpA5PP<>USslv0JddxXW0d;xDXEbQkN(&l1+2YP=u(;4#*>
zJ87W4k67QPuz~uFXw9)3_ifTWgY`x0V0{WxvA)(&tglB+*y^``ONx~a)^|3RY5$YE
zEwOzQBmHf5%%+ZIq}@&xi=L$k8x2%vbch34QPbXpk3)r!*X~8L=43|trzmFP4tJIk
zM4xDz#O3`yKPOvecRb|Xp|(nECnFIk>y7MmfVxzatLsm)lw5`9lBfhuMK@~av{B<(
z(U(~xlc_4;;<F-k+j!71vKh7q&gIt%(WYr@gmsI4JNWKB09@P;u{RR2zD^meufYfF
zE4q}|I7)v0uCVD?pV~dFPaWd;`XUvvzVWw&zf;qXSsZo6`qW2aeO)iGzJ_;LUyKc{
zqC4uzz1iVdpKLqU7xHPKz6z|*!9K5UQMq0AL8XEEmNF9KkIr3zo0<NX$IIJ|_pp?a
z%=E{br*Yb|{ij}>_JkGN1t-|dFJL;n*m+_1l~}5*<=Dl0&CD-vyYx>67Q)Yyk1@9A
z99X}F+^Pe8CXB>f@8L%GBTT=6e^yN6VV1Hu@Obcs%)FZ8s_Lsc>{zj@Fv-kQh3SaC
z(&;ot!cvV&`H}l|SlO?wktaOH(PQ2$5T-}fvwlY{>xE8MSYJQ_)@OJF>)W)E)}Fnq
zWuXS#Q@*~rKe4{MtpoLiVtx8Xd7mfso@%3xV0}Fb1NHe~eNAw$NPQ8)n&UQcW_K22
zeG4CBed9(C)K`Y}%?;<qyzzSxk==v!&4^+omgjv_?7y*8fSbwv+-Gnexu^Dcm~e{T
z8U-1XpR!`-Z%_SYmNV@i%{*>ey0DbKpK$7&xA2_|J0*?UEBvW$lcc%fDl2+wwA$)h
zFe8~1dp`U9OO}c)cpHBJ?%@K7$ShQvCz+;~`nmowD_-iM)^q=Frv1HEdzCJnm*I;+
zigI;~^tr{_O|}cUQ3|)>V$I?H-QQ)q<i)yy`Vz3d+u2y(oDf>k+etp&(+lep?85r2
z<gvcFJF&jlV|hQ--F7AinPPoyK3HGXWUMd24eN_oE0j$2xtq82DAxC2HrA(e59^aR
z#QLm`(xVh|DqICoSl{ylaG!hT^#$9qlvas`&e`KErJOy_HSaDf`YN@Qix|d8<geEx
zs=+<BH{3F!YYa<`Qm%>&dc=yljAd(D1L5C~4Pn+iV5Ggr_v^fmVkyNtp{{8@@E!b<
zK1KujjqL03*;iayYE0Ac7AxqlQTDd4zgn=Oj$&tT$23Nwe5a@>pb_d^Vp(Gt!2D_6
zQGa0dPF5^1T_KRGf$!*nZ0jP1k@hr6?p}iLw}SQ6*8j<h8heIu8z(W6p|%IR4z6aY
zG>fN)Eu2_N<lvt+TgZx}CxWvWN8tZ>o?=xUE4UCP2;Y*+3QXU*CA1Z>V)un-R8D#^
z5~XQwDfb;1X^GC5%AtWQHKlL6`*;Ud)R%BREe!fJ&&Ovs@=5%Td{Uqf@qFf-LOu?E
z5k6WYtC7#SuY}J>NBI3bpBW*@Cl&4$&nJ2i;bT&vhI~$`ARjGX<ddzBe6;rs@PR(V
z^D**9K3-7+e0Cro*HYwjqzU;9?m|ARwjv+<=Y-Fpq9cUQg|0({Pv&8$pXc+z9Qgz`
z4De|~KBxadJ`q)f&-*~Q@4&~cb0y((p>Hho*R|hgiK7|mw<>9M5%jt5a;3r!S=#v0
z-?P@o1VeunUw$os-yt}mS`((nQtIZeuh#ppl>E_#mG=#4o7<ZUy$81gPv-!&*(RtL
z*oS(rKcHS$BWHXm`JC0l^QfoTk9yneP*2>CdUyYDHn)8ipO^yQ3$Hg5zWaWkBPH_2
zjP!LUSG~xUk=ERv^@a-Mw45DpsqcTvit3Gxmo!%}61ntbwK45*4rfEAQ(i2klHXCd
z@f@v_Tv%;#$PmsWr$(Vh1N9W%px%;2s3#v!YdM=t+Y+?^^;&MBp2={c=ODd}dK(UN
zI?3Vv&w^u6uW|*PpYoQZ1I92Xob<<6mcc#o93jbar45qp+>GZRhTmJ~G+pH~%%}Wk
z^Dcj5r2XNCI`UIlO7{KKqrC>4&ThZeyZt#<)N-}`sJ<oYovlN?h7{Dh_nI>}XS{E5
zbS&x_E<wGVe^BrJd7^hrCWF@5<$Ua9AJoU|*=%MchP|6s>}+EN*=uf2sNKOxPdC{H
z6p3luZByEe_T;i+MTgb8%^w)a=;B4mxffVLTc&SoEz44?gI;U9H*<Q;ezn(34zXet
zJ>O*xt*Eyz3-y3I(JR}>*=}1cj^5mfdYulacS{TPsxJ@dEui(9s}i2gvqim0b&MoJ
zq2r)H%*tDQP#@$8ePU{ijMdVQ!j2~i2G+GPtT=!FqGR#!E<2#{dgWzJR)HECY$-@%
z1$KVB=2g3Mh7;%4UNmfA#owleRD{n$J()t(TP#GqW#xGtPioIk_Ax=d#AMXV3K`Jr
zL%pgewBbZ=--n?GP%q4ekr>sOPaZytk%mut<Z24<J;htu5>XOovci@vtgU0k<L6sX
zaQ(zc#$A@4jq+xsB9GMV*A}ysLsE%WXfQoJQ?pX-@jX_|j584qN=H4TFw|pYQE!Vc
zZL&P9?RVo#)LTWN-X1g5dlZR!JDj-bnTu=w2wBwAn+o?J{f_tOIgGT-wlGw>3%(=k
zA*u@RX{CVWo@Pfz!9AG0JN`f}Bhef&^X{fTxX0I%m&-FOwf^PFoC#0pQMKVe)B0wx
z;z2KdrzsSoo=za@ed#88D*b;rrS-;&f0+(Nz0h~4x1oyYEt5k3=k=_5xKXus8o}2u
z5WNe(_Au>zyZ=2XfOGx2bIr)n>5Nn=4mj{!fitzRk8zp&mKA;McCNn*zjvatjsGR!
zHbEm-%|(alFPKn2Wq1jvVD<hc_vsre_NpG8qZvT-5~XKQZ<GY}o~6>J7S1=aEhyB>
zU5R>^p}+8Yvo@ig-UC{}%FNGO_Y~^sj)(93=`2A5^yQm&KW+c)V<`idlyv{!+*taB
z$8k+ZR@9j|vAhbtw_|6NPHQ`}lz-p$gQZ5SZ1?Wems)kSaq-RWbyK@o@t3oq6UV_l
z;PvWEQLjrM^~5{rv2>@b;tzk+tGJ1Jn!i!6L=E*csyO4~gWE4ou}3|l!_X&p<n6HA
z0^hg0-YxM>@DZ0ZRwotH*3}~n(=AUxpA1^xVgml*&!3%fyJ0>FQk)tR%3+@TtZ(cw
z=M<f8sH+%GgT7I&aA~j->P_QNZ=(_Fow4DptLOd9Xqt+8R>7!u8u~r2cVZIJYf=uS
z71@8)dx|%p-blXxJ<CvSu!nc~%hG~P;~8oGMw82S+Ihcpk20}a;hoSl_mX}boCmeN
z{F(1@cy}j#5yoV&lu|+cnY*c+{F=-U4~N#my(*KOjFubFb40zMb*MMFPWVgLwDzT9
z_kbR}qj<elNvO9S-t)ZPEmK;4?as2R1$C$w(#A*zw~j4Lhq*BPtwGE5Yb-Tu!$id#
zb$WtQ74?YgV8v9nL6d<8)7~qyUq$;qOAX2IlCBvI^Kxx+nV^H#Em=~!=K2&?^hvX0
z?NJTXyLJloW@Mt?11E06AQPJf_a&%j>y3Ic%Tdqk711kg8_VgwRMv4c{15da)#3N9
zUm5Tb-j{<Web<}QMf_oSO&sxskA7?;zHp%k<{tRgrSwzc3%#cs^M3H<;FJL1FmrLn
zCM)6x&uonY9*uQ-y>1X6m_(HmADA8w?-TH;>dyqk2Ua*#68~qitr7FT-GKSsY%qUm
z4(9(h0rRg+!2DAlVg5;DG5^VHm_O(R=HEOH^Dl+>BlzSb$`tb_q+tFc=uhC2E9gwj
zA2|~4S#GFokuC9eY3g%{zpL=6gmd1TxbOn_1In(*WKI_Gb(haI!F`NAX?Dz<___D}
zGH{=j^`=ZRfxIfpKcANpKNsYcN_<?2V=??bpB0h`ui@Vr*-W!tL44dpr{l!GC0ayd
z{>SjU!T+WWQO5jU@|gd04(4BU74v6(#{B2NCxYLtR33u)$3vgt^B3H}{0m`DgU>CQ
zMq&PmS(xAE59Vk3Fux_dAB{h{=;uG>{aJ-mQ9AKwjjAwzz?VhW$P!=X>Cr`eS+pdc
z__E{mQsT${`WyTw_{I446ynEH9Kj!fA2at~Lwwk}VQs{R-Pm}Y_^{Tjbl!(WseSk#
z@4p(fw>-f7g`JrH6}*$dzc#AF`;^aL7mNAlL*D`48r^Xm^S2ma{xEnSll)T5Pgf4)
zFUI_u37CJwQ_O#<8uM3f!Tf>N#9tNf10M<guca8?f#Cmgipq(vdQ$Jk`>MH${jhHU
z-{<@JKJTkedp+Dm{8a2r*f)URYx>+k{M6%DhQv>KtsKGoD0QE&PQ*t&QMKoNl+urV
z{lrHtUiF6fC-#aw=HCzfmCs-D2=o6Jg!vmoFn{lM%-;}<`R}a7{4P$IzY+FN;5Q$a
zT)_NGY0Td+8}qA=8ORTP5`5-j|GSv~nljw4^gCiH+;5oQspl%lydC*{7@4yfTfrZ}
z+`eY<kjz>466j+vXEUmQk@=di$CLE)$o3r4&$G7&lYZ_=+mk;2$>TWb<1Rk&aQ{-A
z3WMN&!(6z2VH(`;a)k}c$B}v3y!I@arxA|x$vpk?4*bo>ZkOp3RLLCOFKHxm)N0)e
zGDkz-!n+aXaknQ8_xo9fr==m^ugjD)vH!QnC6PY8DxeI0-}drfQ+46{Ncs{~0P_&~
zbn(U=q(4jlJx}JPap@N_FCz*Tl6m>m8v1-^;v}s&7cwWC=4q2TS>7^;pOY76EQEI#
z%-jE(`$#`7HP0jcxVhJZ^kemnQqqTiJ}f1Dc(qaw--mzL>BRGW_}tKH`2L`u=U5IR
z{Qb1ek$>5D!rx%XC*+?7_lxImpNRZFz<18`|NI^K|I>y18=<ele9jJ7hWy9BMgI2J
zk-v2`^7qO^{#))Lf5^|jkDA4W$bY8$0Dtg*Jb$~1$iD@?A9%-Dt#C#D&uWpsbI$;O
z8u^b<BlEI-*J|V+D2M$20w2TkFL{Ohw}Zcec`LmO=L7E*<BD0x|C1T=pA7ep=YQ}b
z@?WwD`G3ts{;Lim|895(!h9{&2todhs>pxIaO8hj0r|5(kpDyx;jf}{8u_o)0zPZM
z=Q^$izgSJVTeJ|rc)N{-`o&iq_qhV!6RP!tPZOV5=06C|=V*h^JNW*<A8LQjCw^pE
zz9;b`wvBMV!QaFhXA>Wy?EpR#e9g_}(|I4VZrBDmU+^`SZgRwb>>2u;_k%~(df=V$
zy7PhING;wEx~XNk5FdCt@e%MzPj-9lPkdnHr~lv{<bRjl`VaAc+lRsa1pG_R>?q<l
zgmLg*2mcay3ElzVTk0M-6Q2>;XHR^_ihcv)GlI`q^ZsI+=57Z#Kjp0|E|zdVob>%?
zc=CQvE3&`<et(_Q%>%#}d}H`S*bDTBANHt*`vt!7b`yMG;P0BMVE+OB#H$DVHu#gH
ze>8bNaZ^QXPJBeDeI)S_nf&|&UlNf4=L5dP_)ZJ)4@(A3M*g<2*X8+Z!Mhs#|Lul@
z$e;H=Jb%^o$UjI9`B%Zaj_3b<Ir7hGMgHaR{qy{XtwR2Wz=!9*ZUXY(2KyPFzk3Ms
zH-!Bz&%bLG@*jQ(`JaRL8~FOu`#q5VJKzJpKC%w(8P7k$2Kno^Apcn=$UoWw`KK>K
z{<-M$BKxzDf3Fnzr+6U$y)Ti!{vqUV0i1dMXV)PAwfy@E_+Nzi&-2fmi~LJuk$;vO
z@~@0X{`thu-Bg)1m+;^AYYy_C0&|P!AJ&Wf&%^!-{Gitdc;5(m=8ub;NA^<-c+RlD
z+Z9$$Glph{|DK0E8SJT6UhT%XgiRQy3GW#`Znr$g7zWJN8GaVy9>Km0_I>+km_xAd
z(?~KU`>eoq-{}!T_t+R4n47Todn4U|aaOR0;N#+;Kfzuwa6uC{!oe+e-E;W=9DLk|
zzaa0}kk~Re;BtKbZ5ucT*ng$gX3*xw=WkSKDv~`|=HoLM_hBo>IeTNAB$qQEz1;jv
z;VO)){7m*`_pU3!c{UCIY@9*%WnYE`(blTrC;y&26VA2Rd42L&jLS8_IE#rGXZD-3
zR_)Z)u7|x8A9n@j2kamJCx!U}`?WM#IkI0H`K6c}XDNM|;i1a!*_NNUUhx0pWPCBM
z?HWDKvZvxi`;vjUTDT7nWP5XppiehR-aIrV`#4qWWwg1b=g#g|SuDS|EV`nCarfcf
z%EvuFhH;ZqIdjeMV^vo(G42L%8ESjTte}JJ@3;b(Q?TEx`Ej3idQ-VECT<(Q#|s&*
z9fol&@J@t1DEG~p#C_Cm=A6nG>vj&U#yB5ce&09ud$W-2`<A!CeTIGLzEVfdMA*${
zhbQrSztfFYB^cLKh;bb%195J&39UZbZ~a${vy_tkV9XroyRbjKvIKk~><0x2UpTV|
ou9r@!rI0<L?+I;;6CcF5li+*!xPP5!v%I&|s~;CI?uFd{0r+0ak^lez


From 0b2e49137516da01e333647ed4c60bbf15a6a3be Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 12 Jun 2024 10:26:35 -0600
Subject: [PATCH 061/571] style - bool name consistency

---
 .../jit-source/cuda/cuda-ref-basis-tensor.h   | 46 +++++++++----------
 .../jit-source/hip/hip-ref-basis-tensor.h     | 46 +++++++++----------
 2 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index 7361c994e0..de99b044fb 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -17,7 +17,7 @@
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -29,15 +29,15 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
     s_interp_1d[k] = interp_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_size        = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
@@ -77,7 +77,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
 //------------------------------------------------------------------------------
 // Grad
 //------------------------------------------------------------------------------
-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                 const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -91,23 +91,23 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
     s_grad_1d[k]   = grad_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_dim_stride  = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
+  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
       // dim*dim contractions for grad
       for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-        CeedInt           pre   = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+        CeedInt           pre   = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
         CeedInt           post  = 1;
         const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
         CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
@@ -129,7 +129,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-            if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
+            if (is_transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
             else out[k] = v_k;
           }
           post *= Q;
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index efbb06548b..7482bf9b33 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -17,7 +17,7 @@
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -29,15 +29,15 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
     s_interp_1d[k] = interp_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_size        = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
@@ -77,7 +77,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
 //------------------------------------------------------------------------------
 // Grad
 //------------------------------------------------------------------------------
-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                 const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -91,23 +91,23 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
     s_grad_1d[k]   = grad_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_dim_stride  = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
+  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
       // dim*dim contractions for grad
       for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-        CeedInt           pre   = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+        CeedInt           pre   = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
         CeedInt           post  = 1;
         const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
         CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
@@ -129,7 +129,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-            if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
+            if (is_transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
             else out[k] = v_k;
           }
           post *= Q;

From 2d903c702dffeaea309445a41715ba3173d9e85e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 12 Jun 2024 10:33:05 -0600
Subject: [PATCH 062/571] gpu - drop extra copy in ref tensor basis interp

---
 include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h | 5 +----
 include/ceed/jit-source/hip/hip-ref-basis-tensor.h   | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index de99b044fb..468ec978c0 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -47,14 +47,11 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans
       CeedInt           pre   = u_size;
       CeedInt           post  = 1;
 
-      for (CeedInt k = i; k < u_size; k += blockDim.x) {
-        s_buffer_1[k] = cur_u[k];
-      }
       for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
         // Update buffers used
         pre /= P;
-        const CeedScalar *in       = d % 2 ? s_buffer_2 : s_buffer_1;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
         const CeedInt     writeLen = pre * post * Q;
 
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index 7482bf9b33..7d732f8e77 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -47,14 +47,11 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans
       CeedInt           pre   = u_size;
       CeedInt           post  = 1;
 
-      for (CeedInt k = i; k < u_size; k += blockDim.x) {
-        s_buffer_1[k] = cur_u[k];
-      }
       for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
         // Update buffers used
         pre /= P;
-        const CeedScalar *in       = d % 2 ? s_buffer_2 : s_buffer_1;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
         const CeedInt     writeLen = pre * post * Q;
 

From 013a555157cefb861605c4e4ea6ce4e03bb7c3d7 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 4 Oct 2023 12:54:19 -0600
Subject: [PATCH 063/571] fluids: Add PyTorch external DD SGS evaluation

- Rename sequential_internal -> *_ceed
---
 Makefile                                      |   2 +-
 examples/fluids/Makefile                      |  43 ++++-
 examples/fluids/README.md                     |  18 ++-
 examples/fluids/include/sgs_model_torch.h     |  27 ++++
 examples/fluids/index.md                      |   4 +-
 examples/fluids/navierstokes.c                |   3 +-
 examples/fluids/navierstokes.h                |   9 ++
 examples/fluids/problems/newtonian.c          |   1 +
 examples/fluids/problems/sgs_dd_model.c       | 100 +++++++++---
 .../fluids/problems/sgs_model_torch_weak.c    |  22 +++
 .../fluids/problems/torch/sgs_model_torch.cpp | 152 ++++++++++++++++++
 examples/fluids/pytorch_pkgconfig.py          |  69 ++++++++
 examples/fluids/qfunctions/sgs_dd_model.h     |   2 +
 examples/fluids/src/misc.c                    |   1 +
 .../createPyTorchModel/NNModel_HIT.pt         | Bin 0 -> 1908 bytes
 .../NNModel_HIT_fp64_jit.pt                   | Bin 0 -> 9977 bytes
 .../tests-output/createPyTorchModel/README.md |   1 +
 .../createPyTorchModel/update_weights.py      |  71 ++++++++
 18 files changed, 487 insertions(+), 38 deletions(-)
 create mode 100644 examples/fluids/include/sgs_model_torch.h
 create mode 100644 examples/fluids/problems/sgs_model_torch_weak.c
 create mode 100644 examples/fluids/problems/torch/sgs_model_torch.cpp
 create mode 100644 examples/fluids/pytorch_pkgconfig.py
 create mode 100644 examples/fluids/tests-output/createPyTorchModel/NNModel_HIT.pt
 create mode 100644 examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
 create mode 100644 examples/fluids/tests-output/createPyTorchModel/README.md
 create mode 100755 examples/fluids/tests-output/createPyTorchModel/update_weights.py

diff --git a/Makefile b/Makefile
index 750ea2a6cd..a8acff1644 100644
--- a/Makefile
+++ b/Makefile
@@ -623,7 +623,7 @@ $(OBJDIR)/petsc-% : examples/petsc/%.c examples/petsc/libutils.a.PHONY $(libceed
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/petsc/$* $@
 
-$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/problems/*.c examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) | $$(@D)/.DIR
+$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/problems/*.c examples/fluids/problems/torch/*.cpp examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) | $$(@D)/.DIR
 	+$(call quiet,MAKE) -C examples/fluids CEED_DIR=`pwd` \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/fluids/$* $@
diff --git a/examples/fluids/Makefile b/examples/fluids/Makefile
index f5a1d7c8c0..3465b77af7 100644
--- a/examples/fluids/Makefile
+++ b/examples/fluids/Makefile
@@ -23,9 +23,6 @@ PETSc.pc := $(PETSC_DIR)/$(PETSC_ARCH)/lib/pkgconfig/PETSc.pc
 CEED_DIR ?= ../..
 ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc
 
-# ASAN must be left empty if you don't want to use it
-ASAN ?=
-
 CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc))
 CFLAGS = -std=c99 \
   $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \
@@ -33,22 +30,51 @@ CFLAGS = -std=c99 \
   $(OPT) $(OPT_EXAMPLES)
 CPPFLAGS = $(call pkgconf, --cflags-only-I $(PETSc.pc) $(ceed.pc)) \
   $(call pkgconf, --variable=cflags_dep $(PETSc.pc))
+CXX = $(call pkgconf, --variable=cxxcompiler $(PETSc.pc) $(ceed.pc))
+CXXFLAGS = -std=c++17 -Wno-deprecated -Wno-tautological-compare
 LDFLAGS = $(call pkgconf, --libs-only-L --libs-only-other $(PETSc.pc) $(ceed.pc))
 LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(PETSc.pc) $(ceed.pc)))
-LDLIBS = $(call pkgconf, --libs-only-l $(PETSc.pc) $(ceed.pc)) -lm
+LDLIBS = $(call pkgconf, --libs-only-l $(PETSc.pc) $(ceed.pc)) -lm -lstdc++
 
-AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
+# Address Sanitizer Setup
+# ASAN must be left empty if you don't want to use it
+ASAN ?=
+AFLAGS ?= -fsanitize=address
+# Also: -fsanitize=undefined -fno-omit-frame-pointer
 CFLAGS += $(if $(ASAN),$(AFLAGS))
 FFLAGS += $(if $(ASAN),$(AFLAGS))
 LDFLAGS += $(if $(ASAN),$(AFLAGS))
 CPPFLAGS += -I./include
 
+# LibTorch
+USE_TORCH ?=
+ifeq ($(USE_TORCH),1)
+  libtorch.pc := $(shell python ./pytorch_pkgconfig.py)
+  CPPFLAGS += $(call pkgconf, --cflags-only-I $(libtorch.pc))
+  CXXFLAGS += $(call pkgconf, --cflags-only-other $(libtorch.pc))
+  LDFLAGS += $(call pkgconf, --libs-only-L --libs-only-other $(libtorch.pc))
+  LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(libtorch.pc)))
+  LDLIBS += $(call pkgconf, --libs-only-l $(libtorch.pc))
+
+  src.cpp += $(sort $(wildcard $(PROBLEMDIR)/torch/*.cpp))
+  src.c += $(sort $(wildcard $(PROBLEMDIR)/torch/*.c))
+
+  # Intel Pytorch EXtension (IPEX)
+  IPEX_DIR ?=
+  ifdef IPEX_DIR
+      LDFLAGS += -L$(IPEX_DIR)/lib/
+      LDFLAGS += -Wl,-rpath,$(IPEX_DIR)/lib/
+      LDLIBS += -lintel-ext-pt-gpu
+  endif
+endif
+
+# Source Files
 OBJDIR := build
 SRCDIR := src
 PROBLEMDIR := problems
 
 src.c := navierstokes.c $(sort $(wildcard $(PROBLEMDIR)/*.c)) $(sort $(wildcard $(SRCDIR)/*.c))
-src.o = $(src.c:%.c=$(OBJDIR)/%.o)
+src.o = $(src.c:%.c=$(OBJDIR)/%.o) $(src.cpp:%.cpp=$(OBJDIR)/%.o)
 
 # Path to install directory for SmartRedis. Example: /software/smartredis/install
 SMARTREDIS_DIR ?=
@@ -67,7 +93,7 @@ endif
 all: navierstokes
 
 navierstokes: $(src.o) | $(PETSc.pc) $(ceed.pc)
-	$(call quiet,LINK.o) $(CEED_LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+	$(call quiet,LINK.o) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
 
 .SECONDEXPANSION: # to expand $$(@D)/.DIR
 %/.DIR :
@@ -80,6 +106,9 @@ quiet ?= $($(1))
 $(OBJDIR)/%.o : %.c | $$(@D)/.DIR
 	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
 
+$(OBJDIR)/%.o : %.cpp | $$(@D)/.DIR
+	$(call quiet,CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $(abspath $<)
+
 print: $(PETSc.pc) $(ceed.pc)
 	$(info CC      : $(CC))
 	$(info CFLAGS  : $(CFLAGS))
diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index f24e09a694..5b038c71f4 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -654,10 +654,20 @@ For the Density Current, Channel, and Blasius problems, the following common com
   - `./dd_sgs_parameters`
   - string
 
-* - `-sgs_model_dd_use_fused`
-  - Whether to use "fused" mode for data-driven model evaluation
-  - `true`
-  - boolean
+* - `-sgs_model_dd_model_implementation`
+  - Which computational implementation to use for SGS DD model (`fused`, `sequential_ceed`, `sequential_torch`)
+  - `fused`
+  - string
+
+* - `-sgs_model_dd_torch_model_path`
+  - Path to the PyTorch `*.pt` file containing the DD inference model
+  -
+  - string
+
+* - `-sgs_model_dd_torch_model_device`
+  - What hardware to perform the model inference on (`cpu`, `cuda`, `hip`, `xpu`)
+  - Default matches the libCEED backend
+  - string
 
 * - `-diff_filter_monitor`
   - Enable differential filter TSMonitor
diff --git a/examples/fluids/include/sgs_model_torch.h b/examples/fluids/include/sgs_model_torch.h
new file mode 100644
index 0000000000..16217bb51b
--- /dev/null
+++ b/examples/fluids/include/sgs_model_torch.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <petsc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  TORCH_DEVICE_CPU,
+  TORCH_DEVICE_CUDA,
+  TORCH_DEVICE_HIP,
+  TORCH_DEVICE_XPU,
+} TorchDeviceType;
+static const char *const TorchDeviceTypes[] = {"cpu", "cuda", "hip", "xpu", "TorchDeviceType", "TORCH_DEVICE_", NULL};
+
+PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum);
+PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index f035168ec0..cf52923930 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -374,8 +374,8 @@ Conversely, sequential mode has separate function calls/CeedOperators for input
 By separating the three steps to the model evaluation, the sequential mode allows for functions calling external libraries to be used for the model inference step.
 This however is slower than the fused kernel, but this requires a native libCEED inference implementation.
 
-To use the fused mode, set `-sgs_model_dd_use_fused true`.
-To use the sequential mode, set the same flag to `false`.
+To use the fused mode, set `-sgs_model_dd_implementation fused`.
+To use the sequential mode, set the same flag to `sequential_internal`.
 
 (differential-filtering)=
 ### Differential Filtering
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index f4c9676907..249a4862d6 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -18,11 +18,12 @@
 //     ./navierstokes -ceed /cpu/self -options_file gaussianwave.yml
 //     ./navierstokes -ceed /gpu/cuda -problem advection -degree 1
 //
+//TESTARGS(name="Blasius, SGS DataDriven Sequential Torch") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_torch -sgs_model_dd_torch_model_path ./examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
+//TESTARGS(name="Blasius, SGS DataDriven Sequential Ceed") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_internal
 //TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
 //TESTARGS(name="Advection 2D, rotation, explicit, supg, consistent mass") -ceed {ceed_resource} -test_type solver -problem advection -degree 3 -dm_plex_box_faces 2,2 -dm_plex_box_lower 0,0 -dm_plex_box_upper 125,125 -bc_wall 1,2,3,4 -wall_comps 4 -units_kilogram 1e-9 -rc 100. -ts_dt 1e-3 -ts_max_steps 10 -stab supg -Ctaus 0.5 -mass_ksp_type gmres -mass_pc_type vpbjacobi -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv2d-rotation-explicit-stab-supg-consistent-mass.bin
 //TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew  -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 7e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin
 //TESTARGS(name="Blasius, bc_slip") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/blasius.yaml -ts_max_steps 5 -dm_plex_box_faces 3,20,1 -platemesh_nDelta 10 -platemesh_growth 1.2 -bc_outflow 5 -bc_slip 4 -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-bc_slip.bin
-//TESTARGS(name="Blasius, SGS DataDriven Sequential") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_use_fused false
 //TESTARGS(name="Advection, rotation, cosine") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 0 -advection_ic_type cosine_hill -dm_plex_box_faces 2,1,1 -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-rotation-cosine.bin
 //TESTARGS(name="Gaussian Wave, using MatShell") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -amat_type shell -pc_type vpbjacobi -ts_alpha_radius 0.5
 //TESTARGS(name="Taylor-Green Vortex IC") -ceed {ceed_resource} -problem taylor_green -test_type solver -dm_plex_dim 3 -dm_plex_box_faces 6,6,6 -ts_max_steps 0 -compare_final_state_atol 1e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-taylor-green-IC.bin
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 75dc8a57fd..1c46f57776 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -66,6 +66,15 @@ typedef enum {
 } SGSModelType;
 static const char *const SGSModelTypes[] = {"none", "data_driven", "SGSModelType", "SGS_MODEL_", NULL};
 
+// Subgrid-Stress mode type
+typedef enum {
+  SGS_MODEL_DD_FUSED           = 0,
+  SGS_MODEL_DD_SEQENTIAL_CEED  = 1,
+  SGS_MODEL_DD_SEQENTIAL_TORCH = 2,
+} SGSModelDDImplementation;
+static const char *const SGSModelDDImplementations[] = {"fused", "sequential_ceed", "sequential_torch", "SGSModelDDImplementation", "SGS_MODEL_DD_",
+                                                        NULL};
+
 // Mesh transformation type
 typedef enum {
   MESH_TRANSFORM_NONE      = 0,
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 80f84c7ec9..efd1f37187 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -115,6 +115,7 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
+
 PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   SetupContext             setup_context;
   User                     user   = *(User *)ctx;
diff --git a/examples/fluids/problems/sgs_dd_model.c b/examples/fluids/problems/sgs_dd_model.c
index 3f5f3cddb6..c87ca93f35 100644
--- a/examples/fluids/problems/sgs_dd_model.c
+++ b/examples/fluids/problems/sgs_dd_model.c
@@ -9,12 +9,14 @@
 
 #include <petscdmplex.h>
 
+#include <sgs_model_torch.h>
 #include "../navierstokes.h"
 
 typedef struct {
-  CeedElemRestriction  elem_restr_grid_aniso, elem_restr_sgs;
-  CeedVector           grid_aniso_ceed;
-  CeedQFunctionContext sgsdd_qfctx, ifunction_qfctx;
+  CeedElemRestriction      elem_restr_grid_aniso, elem_restr_sgs;
+  CeedVector               grid_aniso_ceed;
+  CeedQFunctionContext     sgsdd_qfctx, ifunction_qfctx;
+  SGSModelDDImplementation sgs_dd_model_implementation;
 } *SgsDDSetupData;
 
 PetscErrorCode SgsDDSetupDataDestroy(SgsDDSetupData sgs_dd_setup_data) {
@@ -146,12 +148,11 @@ static PetscErrorCode SgsDDSetupNodalEvaluation_Fused(Ceed ceed, User user, Ceed
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-// @brief Setup data-driven model inference using internal (libCEED native) implementation
-static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Internal(Ceed ceed, SgsDDData sgs_dd_data, SgsDDSetupData sgs_dd_setup_data,
-                                                                    CeedElemRestriction elem_restr_dd_inputs,
-                                                                    CeedElemRestriction elem_restr_dd_outputs,
-                                                                    CeedElemRestriction elem_restr_inv_multiplicity, CeedVector inv_multiplicity,
-                                                                    void **ctx) {
+// @brief Setup data-driven model inference using libCEED native implementation
+static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Ceed(Ceed ceed, SgsDDData sgs_dd_data, SgsDDSetupData sgs_dd_setup_data,
+                                                                CeedElemRestriction elem_restr_dd_inputs, CeedElemRestriction elem_restr_dd_outputs,
+                                                                CeedElemRestriction elem_restr_inv_multiplicity, CeedVector inv_multiplicity,
+                                                                void **ctx) {
   CeedQFunction         qf_sgs_dd_inference;
   CeedOperator          op_sgs_dd_inference;
   OperatorApplyContext *op_context = (OperatorApplyContext *)ctx;
@@ -180,8 +181,8 @@ static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Internal(Ceed ceed, S
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-// @brief Perform data-driven model inference using internal (libCEED native) implementation
-PetscErrorCode SgsDDNodalStressEval_Sequential_Internal(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx) {
+// @brief Perform data-driven model inference using libCEED native implementation
+PetscErrorCode SgsDDNodalStressEval_Sequential_Ceed(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx) {
   OperatorApplyContext op_context = *(OperatorApplyContext *)ctx;
 
   PetscFunctionBeginUser;
@@ -189,6 +190,44 @@ PetscErrorCode SgsDDNodalStressEval_Sequential_Internal(Vec DD_Inputs_loc, Vec D
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+// @brief Setup data-driven model inference using libtorch
+static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Torch(Ceed ceed, SgsDDData sgs_dd_data, SgsDDSetupData sgs_dd_setup_data,
+                                                                 CeedElemRestriction elem_restr_dd_inputs, CeedElemRestriction elem_restr_dd_outputs,
+                                                                 CeedElemRestriction elem_restr_inv_multiplicity, CeedVector inv_multiplicity,
+                                                                 void **ctx) {
+  const char     *ceed_resource;
+  char            model_path[PETSC_MAX_PATH_LEN] = "";
+  TorchDeviceType model_device_type;
+
+  PetscFunctionBeginUser;
+  PetscCallCeed(ceed, CeedGetResource(ceed, &ceed_resource));
+  if (strstr(ceed_resource, "/gpu/cuda")) model_device_type = TORCH_DEVICE_CUDA;
+  else if (strstr(ceed_resource, "/gpu/hip")) model_device_type = TORCH_DEVICE_HIP;
+  else if (strstr(ceed_resource, "/gpu/sycl")) model_device_type = TORCH_DEVICE_XPU;
+  else model_device_type = TORCH_DEVICE_CPU;
+  PetscCall(PetscOptionsGetEnum(NULL, NULL, "-sgs_model_dd_torch_model_device", TorchDeviceTypes, (PetscEnum *)&model_device_type, NULL));
+  PetscCall(PetscOptionsGetString(NULL, NULL, "-sgs_model_dd_torch_model_path", model_path, sizeof(model_path), NULL));
+
+  PetscCall(LoadModel_Torch(model_path, model_device_type));
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Perform data-driven model inference using libtorch
+static PetscErrorCode SgsDDNodalStressEval_Sequential_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx) {
+  static PetscBool run_through = PETSC_FALSE;
+  PetscFunctionBeginUser;
+  if (!run_through) {
+    PetscCall(VecViewFromOptions(DD_Inputs_loc, NULL, "-dd_inputs_loc_view"));
+  }
+  PetscCall(ModelInference_Torch(DD_Inputs_loc, DD_Outputs_loc));
+  if (!run_through) {
+    PetscCall(VecViewFromOptions(DD_Outputs_loc, NULL, "-dd_outputs_loc_view"));
+    run_through = PETSC_TRUE;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 // @brief Evaluate data-driven SGS using sequential method
 PetscErrorCode SgsDDNodalStressEval_Sequential(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc) {
   SgsDDData    sgs_dd_data = user->sgs_dd_data;
@@ -358,10 +397,17 @@ static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential(Ceed ceed, User user,
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_outputs));
   }
 
-  sgs_dd_data->sgs_nodal_inference = SgsDDNodalStressEval_Sequential_Internal;
-  sgs_dd_data->sgs_nodal_eval      = SgsDDNodalStressEval_Sequential;
-  PetscCall(SgsDDSetupNodalEvaluation_Sequential_Internal(ceed, sgs_dd_data, sgs_dd_setup_data, elem_restr_dd_inputs, elem_restr_dd_outputs,
-                                                          elem_restr_inv_multiplicity, inv_multiplicity, &sgs_dd_data->sgs_nodal_inference_ctx));
+  sgs_dd_data->sgs_nodal_eval = SgsDDNodalStressEval_Sequential;
+
+  if (sgs_dd_setup_data->sgs_dd_model_implementation == SGS_MODEL_DD_SEQENTIAL_CEED) {
+    sgs_dd_data->sgs_nodal_inference = SgsDDNodalStressEval_Sequential_Ceed;
+    PetscCall(SgsDDSetupNodalEvaluation_Sequential_Ceed(ceed, sgs_dd_data, sgs_dd_setup_data, elem_restr_dd_inputs, elem_restr_dd_outputs,
+                                                        elem_restr_inv_multiplicity, inv_multiplicity, &sgs_dd_data->sgs_nodal_inference_ctx));
+  } else if (sgs_dd_setup_data->sgs_dd_model_implementation == SGS_MODEL_DD_SEQENTIAL_TORCH) {
+    sgs_dd_data->sgs_nodal_inference = SgsDDNodalStressEval_Sequential_Torch;
+    PetscCall(SgsDDSetupNodalEvaluation_Sequential_Torch(ceed, sgs_dd_data, sgs_dd_setup_data, elem_restr_dd_inputs, elem_restr_dd_outputs,
+                                                         elem_restr_inv_multiplicity, inv_multiplicity, &sgs_dd_data->sgs_nodal_inference_ctx));
+  }
 
   sgs_dd_setup_data->elem_restr_sgs = elem_restr_sgs;
 
@@ -515,7 +561,6 @@ PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData
   MPI_Comm                 comm                           = user->comm;
   char                     sgs_dd_dir[PETSC_MAX_PATH_LEN] = "./dd_sgs_parameters";
   SgsDDSetupData           sgs_dd_setup_data;
-  PetscBool                use_fused;
   NewtonianIdealGasContext gas;
 
   PetscFunctionBeginUser;
@@ -526,13 +571,17 @@ PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData
   user->sgs_dd_data->num_comp_inputs  = 6;
   user->sgs_dd_data->num_comp_outputs = 6;
 
-  use_fused = PETSC_TRUE;
+  PetscCall(PetscNew(&sgs_dd_setup_data));
+
   PetscOptionsBegin(comm, NULL, "SGS Data-Driven Model Options", NULL);
   PetscCall(PetscOptionsReal("-sgs_model_dd_leakyrelu_alpha", "Slope parameter for Leaky ReLU activation function", NULL, alpha, &alpha, NULL));
   PetscCall(PetscOptionsString("-sgs_model_dd_parameter_dir", "Path to directory with model parameters (weights, biases, etc.)", NULL, sgs_dd_dir,
                                sgs_dd_dir, sizeof(sgs_dd_dir), NULL));
-  PetscCall(
-      PetscOptionsBool("-sgs_model_dd_use_fused", "Use the fused SGS DD model evaluation instead of sequential", NULL, use_fused, &use_fused, NULL));
+  PetscCall(PetscOptionsDeprecated("-sgs_model_dd_use_fused", NULL, "libCEED 0.12.0", "Use -sgs_model_dd_type instead"));
+  sgs_dd_setup_data->sgs_dd_model_implementation = SGS_MODEL_DD_FUSED;
+  PetscCall(PetscOptionsEnum("-sgs_model_dd_implementation", "Data-Driven SGS model implementation", NULL, SGSModelDDImplementations,
+                             (PetscEnum)sgs_dd_setup_data->sgs_dd_model_implementation, (PetscEnum *)&sgs_dd_setup_data->sgs_dd_model_implementation,
+                             NULL));
   PetscOptionsEnd();
 
   PetscCall(PetscNew(&sgsdd_ctx));
@@ -547,8 +596,6 @@ PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData
   // -- Create DM for storing SGS tensor at nodes
   PetscCall(SgsDDCreateDM(user->dm, &user->sgs_dd_data->dm_sgs, user->app_ctx->degree, user->app_ctx->q_extra, &user->sgs_dd_data->num_comp_sgs));
 
-  PetscCall(PetscNew(&sgs_dd_setup_data));
-
   PetscCallCeed(ceed, CeedQFunctionContextGetDataRead(problem->apply_vol_ifunction.qfunction_context, CEED_MEM_HOST, &gas));
   sgsdd_ctx->gas = *gas;
   PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas));
@@ -564,8 +611,15 @@ PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData
                                                      &sgs_dd_setup_data->grid_aniso_ceed));
 
   // -- Create Nodal Evaluation Operator
-  if (use_fused) PetscCall(SgsDDSetupNodalEvaluation_Fused(ceed, user, ceed_data, sgs_dd_setup_data));
-  else PetscCall(SgsDDSetupNodalEvaluation_Sequential(ceed, user, ceed_data, sgs_dd_setup_data));
+  switch (sgs_dd_setup_data->sgs_dd_model_implementation) {
+    case SGS_MODEL_DD_FUSED:
+      PetscCall(SgsDDSetupNodalEvaluation_Fused(ceed, user, ceed_data, sgs_dd_setup_data));
+      break;
+    case SGS_MODEL_DD_SEQENTIAL_CEED:
+    case SGS_MODEL_DD_SEQENTIAL_TORCH:
+      PetscCall(SgsDDSetupNodalEvaluation_Sequential(ceed, user, ceed_data, sgs_dd_setup_data));
+      break;
+  }
 
   // -- Create Operator to evalutate residual of SGS stress
   PetscCall(SgsSetupNodalIFunction(ceed, user, ceed_data, sgs_dd_setup_data));
diff --git a/examples/fluids/problems/sgs_model_torch_weak.c b/examples/fluids/problems/sgs_model_torch_weak.c
new file mode 100644
index 0000000000..36992a7d5c
--- /dev/null
+++ b/examples/fluids/problems/sgs_model_torch_weak.c
@@ -0,0 +1,22 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+//
+// @file This creates weak functions for libtorch dependent functions.
+
+#include <sgs_model_torch.h>
+
+PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum) __attribute__((weak));
+PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum) {
+  PetscFunctionBeginUser;
+  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with USE_TORCH set to run %s", __func__);
+}
+
+PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) __attribute__((weak));
+PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) {
+  PetscFunctionBeginUser;
+  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with USE_TORCH set to run %s", __func__);
+}
diff --git a/examples/fluids/problems/torch/sgs_model_torch.cpp b/examples/fluids/problems/torch/sgs_model_torch.cpp
new file mode 100644
index 0000000000..969bc57ebb
--- /dev/null
+++ b/examples/fluids/problems/torch/sgs_model_torch.cpp
@@ -0,0 +1,152 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <petsc.h>
+#include <sgs_model_torch.h>
+#include <torch/script.h>
+#include <torch/torch.h>
+
+torch::jit::script::Module model;
+torch::DeviceType          device_model;
+
+static PetscErrorCode EnumToDeviceType(TorchDeviceType device_enum, torch::DeviceType *device_type) {
+  PetscFunctionBeginUser;
+  switch (device_enum) {
+    case TORCH_DEVICE_CPU:
+      *device_type = torch::kCPU;
+      break;
+    case TORCH_DEVICE_XPU:
+      *device_type = torch::kXPU;
+      break;
+    case TORCH_DEVICE_CUDA:
+      *device_type = torch::kCUDA;
+      break;
+    case TORCH_DEVICE_HIP:
+      *device_type = torch::kHIP;
+      break;
+    default:
+      SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "TorchDeviceType %d not supported by PyTorch inference", device_enum);
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+static PetscErrorCode PetscMemTypeToDeviceType(PetscMemType mem_type, torch::DeviceType *device_type) {
+  PetscFunctionBeginUser;
+  switch (mem_type) {
+    case PETSC_MEMTYPE_HOST:
+      *device_type = torch::kCPU;
+      break;
+    case PETSC_MEMTYPE_SYCL:
+      *device_type = torch::kXPU;
+      break;
+    case PETSC_MEMTYPE_CUDA:
+      *device_type = torch::kCUDA;
+      break;
+    case PETSC_MEMTYPE_HIP:
+      *device_type = torch::kHIP;
+      break;
+    default:
+      SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "PetscMemType %s not supported by PyTorch inference", PetscMemTypeToString(mem_type));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum) {
+  PetscFunctionBeginUser;
+  PetscCall(EnumToDeviceType(device_enum, &device_model));
+
+  PetscCallCXX(model = torch::jit::load(model_path));
+  PetscCallCXX(model.to(torch::Device(device_model)));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// Load and run model
+PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) {
+  torch::Tensor  input_tensor, output_tensor;
+  const PetscInt num_input_comps = 6, num_output_comps = 6;
+  PetscBool      debug_tensor_output = PETSC_FALSE;
+
+  PetscFunctionBeginUser;
+  // torch::NoGradGuard no_grad; // equivalent to "with torch.no_grad():" in PyTorch
+  {  // Transfer DD_Inputs_loc into input_tensor
+    PetscMemType         input_mem_type;
+    PetscInt             input_size, num_nodes;
+    const PetscScalar   *dd_inputs_ptr;
+    torch::DeviceType    dd_input_device;
+    torch::TensorOptions options;
+
+    PetscCall(VecGetLocalSize(DD_Inputs_loc, &input_size));
+    num_nodes = input_size / num_input_comps;
+    PetscCall(VecGetArrayReadAndMemType(DD_Inputs_loc, &dd_inputs_ptr, &input_mem_type));
+    PetscCall(PetscMemTypeToDeviceType(input_mem_type, &dd_input_device));
+
+    PetscCallCXX(options = torch::TensorOptions().dtype(torch::kFloat64).device(dd_input_device));
+    if (dd_input_device == torch::kXPU) {  // XPU requires device-to-host-to-device transfer
+      PetscCallCXX(input_tensor =
+                       at::from_blob((void *)dd_inputs_ptr, {num_nodes, num_input_comps}, {num_input_comps, 1}, nullptr, options, dd_input_device)
+                           .to(device_model));
+    } else {
+      PetscCallCXX(input_tensor = torch::from_blob((void *)dd_inputs_ptr, {num_nodes, num_input_comps}, options));
+    }
+    if (debug_tensor_output) {
+      double *input_tensor_ptr;
+
+      PetscCall(VecGetLocalSize(DD_Inputs_loc, &input_size));
+      PetscCallCXX(input_tensor_ptr = (double *)input_tensor.contiguous().to(torch::kCPU).data_ptr());
+      printf("Input_Tensor_Pointer:\n");
+      for (PetscInt i = 0; i < input_size; i++) {
+        printf("%f\n", input_tensor_ptr[i]);
+      }
+    }
+    PetscCall(VecRestoreArrayReadAndMemType(DD_Inputs_loc, &dd_inputs_ptr));
+  }
+
+  // Run model
+  PetscCallCXX(output_tensor = model.forward({input_tensor}).toTensor());
+
+  {  // Transfer output_tensor to DD_Outputs_loc
+    torch::DeviceType    dd_output_device;
+    torch::TensorOptions options;
+    PetscInt             output_size;
+    PetscScalar         *dd_outputs_ptr;
+    PetscMemType         output_mem_type;
+
+    {  // Get DeviceType of DD_Outputs_loc
+      PetscCall(VecGetArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr, &output_mem_type));
+      PetscCall(PetscMemTypeToDeviceType(output_mem_type, &dd_output_device));
+      PetscCall(VecRestoreArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr));
+    }
+
+    if (dd_output_device == torch::kXPU) {  // XPU requires device-to-host-to-device transfer
+      double *output_tensor_ptr;
+
+      PetscCall(VecGetLocalSize(DD_Outputs_loc, &output_size));
+      PetscCall(VecGetArray(DD_Outputs_loc, &dd_outputs_ptr));
+      PetscCallCXX(output_tensor_ptr = (double *)output_tensor.contiguous().to(torch::kCPU).data_ptr());
+      if (debug_tensor_output) {
+        printf("Output_Tensor_Pointer:\n");
+        for (PetscInt i = 0; i < output_size; i++) {
+          printf("%f\n", output_tensor_ptr[i]);
+        }
+      }
+      PetscCall(PetscArraycpy(dd_outputs_ptr, output_tensor_ptr, output_size));
+      PetscCall(VecRestoreArray(DD_Outputs_loc, &dd_outputs_ptr));
+    } else {
+      PetscInt      num_nodes;
+      torch::Tensor DD_Outputs_tensor;
+
+      PetscCall(VecGetLocalSize(DD_Outputs_loc, &output_size));
+      num_nodes = output_size / num_output_comps;
+      PetscCall(VecGetArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr, &output_mem_type));
+      PetscCallCXX(options = torch::TensorOptions().dtype(torch::kFloat64).device(dd_output_device));
+      PetscCallCXX(DD_Outputs_tensor = torch::from_blob((void *)dd_outputs_ptr, {num_nodes, num_output_comps}, options));
+      PetscCallCXX(DD_Outputs_tensor.copy_(output_tensor));
+      PetscCall(VecRestoreArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr));
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/pytorch_pkgconfig.py b/examples/fluids/pytorch_pkgconfig.py
new file mode 100644
index 0000000000..de3ed0991b
--- /dev/null
+++ b/examples/fluids/pytorch_pkgconfig.py
@@ -0,0 +1,69 @@
+from pathlib import Path
+import torch
+import torch.utils.cpp_extension as C
+import torch.utils as tutils
+import re
+
+build_dir = Path('./build')
+pkgconfig_path = build_dir / 'libtorch.pc'
+
+variables = {}
+keywords = {}
+
+
+def add_variable(file, variable, value):
+    file.write(f"{variable}={value}\n")
+
+
+def add_keyword(file, key, value):
+    file.write(f"{key}: {value}\n")
+
+
+variables['prefix'] = Path(C.library_paths()[0]).parent.as_posix()
+
+keywords['Name'] = 'libTorch'
+keywords['Description'] = 'Custom made PC for PyTorch'
+keywords['Version'] = torch.__version__
+
+keywords['Cflags'] = ''
+for include_path in C.include_paths():
+    keywords['Cflags'] += f'-I{include_path} '
+
+# Need to search the CMake file to see whether the library was compiled with the CXX11 ABI standard
+regex_ABI = re.compile(r'"(\S*GLIBCXX_USE_CXX11_ABI\S*)"')
+torchCMakePath = Path(tutils.cmake_prefix_path) / 'Torch/TorchConfig.cmake'
+abi_flag = ''
+with torchCMakePath.open('r') as f:
+    for line in f:
+        regex_result = regex_ABI.search(line)
+        if regex_result:
+            abi_flag = regex_result[1]
+
+keywords['Cflags'] += abi_flag
+
+keywords['Libs'] = ''
+for lib_path in C.library_paths():
+    keywords['Libs'] += f'-L{lib_path} '
+keywords['Libs'] += '-lc10 -ltorch_cpu '
+if torch.cuda.is_available():
+    keywords['Libs'] += '-lc10_cuda -ltorch_cuda '
+    # Need to force linking with libtorch_cuda.so, so find path and specify linking flag to force it
+    # This flag might be of limited portability
+    for lib_path in C.library_paths():
+        torch_cuda_path = Path(lib_path) / 'libtorch_cuda.so'
+        if torch_cuda_path.exists():
+            variables['torch_cuda_path'] = torch_cuda_path.as_posix()
+            keywords['Libs'] += f'-Wl,--no-as-needed,"{torch_cuda_path.as_posix()}" '
+keywords['Libs'] += '-ltorch '
+keywords['Libs.private'] = ''
+
+with pkgconfig_path.open('w') as file:
+    for variable, value in variables.items():
+        add_variable(file, variable, value)
+
+    file.write('\n')
+
+    for keyword, value in keywords.items():
+        add_keyword(file, keyword, value)
+
+print(pkgconfig_path.absolute())
diff --git a/examples/fluids/qfunctions/sgs_dd_model.h b/examples/fluids/qfunctions/sgs_dd_model.h
index da1a8f7967..9f05f07960 100644
--- a/examples/fluids/qfunctions/sgs_dd_model.h
+++ b/examples/fluids/qfunctions/sgs_dd_model.h
@@ -159,6 +159,8 @@ CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inference)(void *ctx, CeedInt Q, con
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     CeedScalar model_inputs_i[6], model_outputs_i[6];
+    // CeedScalar model_outputs_i[6];
+    // CeedScalar model_inputs_i[6] = {1, 2, 3, 4, 5, 6};
 
     StoredValuesUnpack(Q, i, 0, 6, (const CeedScalar *)model_inputs, model_inputs_i);
     DataDrivenInference(model_inputs_i, model_outputs_i, sgsdd_ctx);
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 33066ab152..cefd38952c 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -148,6 +148,7 @@ PetscErrorCode RegressionTest(AppCtx app_ctx, Vec Q) {
   PetscFunctionBeginUser;
   // Read reference file
   PetscCall(VecDuplicate(Q, &Qref));
+  PetscCheck(strcmp(app_ctx->test_file_path, "") != 0, comm, PETSC_ERR_FILE_READ, "File for regression test not given");
   PetscCall(PetscViewerBinaryOpen(comm, app_ctx->test_file_path, FILE_MODE_READ, &viewer));
   PetscCall(LoadFluidsBinaryVec(comm, viewer, Qref, NULL, NULL));
 
diff --git a/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT.pt b/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dd31c24d27df99507cd00895b727330dbe52cdb1
GIT binary patch
literal 1908
zcmZvac~lcu6vnd=OhAyet}GT11Y?2;0-5(hTZ>8*E3Fu%5<?hCG$y=cP_!Ug#R>(r
z$D*x;wyw3*{a_K8H;G56p!A3fu25Ips9LMliXPfYsMhv4f6O`ay*I!6-S6I|yi&30
zd1dm`khweF8zfoEn|zzkz6%j2QZs<V;pAE^ltphbX%~=Io6%xMT=s&dKYL;_Qk03*
zlICoq&Ww1gG-6hy7Uhy!n{hE|L;OM{h}5{TqRn8jQpios)9~3ZqnScNwLl|qGA!mC
z<gOOv<*_yOY#9@&r`Res@f2$|X(hAA8TFJHaiY=?H&Wxt{$VC5iBz(XH0Bs6#M4t2
ztKJ~d=21qIO{}$&S$RfNww5BzHj7oeAXbd{k&zmArw8_?E{8;dzF)-2CX0@GzfTk+
zH;p%YM<$mlq;iFPtRg-^s)$2EmtTEu9&0f_l|ucdsyS+ZwQwa8skv(Zl}KD-a9Z=A
zkY^g|?+T#LUzSm4LtX~9xItie6&{z6Ad8Ea$>qq~;I8&(p#xmdV&tRd79-z2pdW?&
z(@?<6hGSne9QYdKSZS<cj9iuwi{4<7PM-svNTJi<Al4^qcW@te2!)2Gp<yq5#!6g1
zgI)u5`c#aOO5;&53l*}^5Erx<g*tr>?*qO`p|CU*o>zz>G$J-Y?R=8bW$P#%8sQ3t
z&7M^`D5?;RbWP#K5=E=IHdnZ8TqvdxjWW0t=d!p0M3O=@`oEeCNehwezgjGcEkt8n
zTAnS7U7=adwSC<vxv!}I_Qw04+BX^gE9$@D3P?%5vl;d|MlZ)@e#@Zj*E)DUpaUPS
z?#AOCMiAh;7JhJd6d2szgcBov0CDGXKz}q5PxTaow!PJ0Pxw{k_BN5~*RWE2cXbu<
zXzLxkuXHD<vA8qVq)5=cs24<SX(Emd8^LUyavLnmu7xQT+vz>+ZmKI+I+@U~{ghep
zYv|G^)9F&~MzB3Gj=o&xsd}jB0d?0V!=gHG08%E>sPYD*KT^&_^xMaboZk$hg^kRW
zc^LC%Uw|9Z*5J&v<*=d1%BV8E=z_3qFgRob_^2ibOm^#_1J3Z_vhmkpV}~y|)WC)P
zwcRi*=K|PaX`)+~x&x0(pCu-!IDlF445X(P6JM_QgUEWa6?Y8w2A5}*(D$XcK+oP2
z@eg(mByOp%fhVG~0E)i^n;t{#-7*NHMFW_lEehsxc`kD-eL3t}Sq!>yBFq>TOjpgh
z-Ox5FxxO^DxxTY=ZGFYG4Zuq{o)|u3y>fk1IX)EB22QSurr&ti9W+-zt;>v`i=$Fx
zbU~<xBj!U7M^~etqw4BCXbKcLe%e)KpBZ3PJ?8TdL94)#{_Piz8lQ1?UGsey?(3Gc
zOI&Wpva$Az_6)n(Gr}>v^)wX6d=4KT->4GyjhQpF?h+wa)Dlyk&!US+o~rD^RXAkZ
z4e0;Rd3b&WlQZHV6VuDLhukp%pSNc)KM79a=b8_&qsF8>kQ)G(oJ(LLU;+^)jD}5<
z*5kl?#dP<N?YK3`&WN`3(y2H8RL-9C8#sJ!9elB0=+qMY{ehqH^scjbOKu|ZL+MvU
z7geV0PTq)*ayzg`41>oUFTv8DQXtwy;!jLF@E4nUuzN&3{;t*wKda&33CWiTKO9KQ
zX7g}<zY?(a(@|KmdnhD-(JCuF%5deb9r)tz^H|f;k8YcvO^2F8=+g^#<EhoY^qF@y
zgN}<A>9J=XD69T>0KERX4zzQYf{}|_z_kk<#Nyp&4(hI-BG54@%s7z>)05W0)&qG&
z^L!P(>;7A`Ja9Ciwx1#bdfZ^UAcek?AFnKP+)*Cicn;K#P(#6raQNW8TtZY;Koo2?
zCf=G?L`<4##YcKG!F^*9A?<yPgEKzH!v`|BqI?bZxO1bSdjAnx_2^;4C;Jf=nWn+$
zNyRYqrVbw648Z%#L-1XBCZS53NCah8<EkxPgki!w5Cl3w^hebI#>LUw6iIk5KG{&9
z9Kt+5w4p&WAQjxK@WCs3$}!G8g$LUruwI;?>~UXDM<1>y<OBx<-#g9BY!||PttB`m
zR6;NLhKmIg?I23}HeMJ1i1=NdLeJ6HVxP_~dhxg<9Fsl_?3Xs;kqfrsn?sg>menPA
XLV+DO&v4+QA&vCHxJn{tQ;_m+4Ikae

literal 0
HcmV?d00001

diff --git a/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt b/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ef631350158acef30b674f448471c73763081eb3
GIT binary patch
literal 9977
zcmd5>2{@G9+aFsPBw5Q+NY)71LZ_5{4Mmp5#AF}FzD6XJ?27D5NTos&a%7F{Qe>+n
zsU(%HUiqHU+xt@S|9^e&dtKi%*SVf&W}fpq_xYXs{+;`rqo+wmjzUpUp?<p9QIseh
z9c?EooC8Kp-4Ju!S$Y@7&el!L3hRay+o_^NhB7fR>AOYP7R6HsH>L_D4!UK5I_ipv
z8NU1Ag=qMuce~EPDj)sADUv3ztABQ?uzU<)dv+|4W#57(K}y<(ZJNP;CcB9Q-v+>T
z%Om8&%zgN5zGt3|G#r{Jn`+aT&A`)Te&;gd>VfYOjU<7VHgIT&Gc1+02N3d7*MD`d
zh2;8|=F7=^;i0(nb)DU_u<<z~$Dz_b$mWw`MU~kI^!Jlq@T0B(7fgqu$))eX?fhrP
z7mkDhvj<-lW?G*@u1zAL>eNrbg+21R<lVOb)#!<@;^r6O=9^`wX|{F20PSm{I#ibc
zM|2Z;ntCX3I~e)&Iz>GU&}c$EQ11mP3O7e8Vjlzk+A3E}TLa`Q>=W>Ix(-8+WgqH3
z^c9Y<uqNt+JqB6)h2wtPufe@eoc3Er9s<@!B-_LCtze@5^~&D*N{Fkc)K0!s4EkwC
zdrj*yL3^;J8BJXYd{r8uJtck;I6KcK7MWfL=8ZjZU2haY@VeZ`;dYGG3Nc5w%=_O0
z`k@lW(skEB?_`p-ZRmaYzWHgL<iSR;HC@I^N~HnT3q^Gba?|Xs=nP`@42EEX+$G!x
zrdr^@x$cTZR|iZLdU0naq8c3Z)1Hvyu7w^g?`W?Xb^<KCVawhb20Sg|@Sc5baBm{-
z^cDJH`22{gk|O>#JpK6=rgBpo^xkrKq~TLDtgE36Kjhj0?y_x?O6qt3EUjZY=+-|3
zf)e#~)QJs1gGSK=@0kU%A=!G@XsW$emC@$o7Cqp(>5dx>!nGjPl|7Baw*{ULu}B@1
zi~|F@F=1)7rGVvD%(YQdhH7_~hcuR7T0zLL8pbfO8lo>#*K(dJgN!*ho_F4P0KBg}
zHz|qBh9kBo3=7}PgL4@h4`$qZ2zH002014;19D6LGUc!ufa0x_nQQ9;T#=ca%#@d*
z%j3=?`&^l;PqgkY*UyB&)8*>R*jK$km%?^$MP)k>yjvN=t9S*RtkIF+;7)*o=ihOT
ziS)u_!He2OcTT_!mwP|^2=s%ANu!9?o)mcLwRkTZV+sJDNw6bA$v`?%h|NpzCHRB|
ztO`e}LC-U`jpcPQ;GNJM%Qfaa&`;lCL-r;Sx-Hz%GXL}d2xgrXF|8)u>!R2kw|P$i
zC<w${k{d696lxP^{axz-C4+Bh{>^51mU@HD3}pwf(+k9A#Pz_!0PVc3&tpNw+sw-e
z7KO00xy^Rt(MKSUvbLNaRR{F~^`yR%bpiDH6#NJ6W}v4@Np;;?D6E<cg(4QcHS|Dc
z4LuN7QIaI;!T<KU=u4eBXF3c>WKH-!wLStN59ZAxIyyk%0yPbXTpBp5DWte3VGyo&
z&nTg941?>B_w#V{4Z=8yTbcqPr9f(rxRhV}0BCccqvuU*2Jd_ZeQbs9LjSbmA=2A=
z;8Fh=wwQ<~a9A~vuC6K{%8r(sjyk1*LPD>A-r07TH$SuMs9hJ#>!55`kROEW^0!w8
zY7GH@o>5Yl$3u|F&%i;&#0*3qiDfN*Sl8f3LPd%3SNu%XxT}H<cfq@tdN&>qv_aLH
z$Nkg7_n~?|Q+JJ87W_h|rni`W72Gu4aJ;3d4hkj<%PBH602IkOCiC`TXei<u{(7bf
zJSdCx9l&HmZiUdAHdz8lZp`^`!{8C9oPVIgT>KQC!Mq;KV7>!JKYoCN{DY9Wr*=E(
z^+tI2h*UK5u@;a>(_Ea>c^lMU>6;_@PzY_mC@fYm6##j87tVI74se5ecdyh~5n#{D
z_psd22t5Z?-L?tdgrDpwiu^J8pt!pB{6n7x7*z4uy{W1SU`_dVsC%Zu?K1eFy1;r+
zEE6#_J$Dya+0eT07>a_AZti^AalQmvoKdG`nd*Vv%#V15tZP8Y-Lpo_x!rIR?O^V$
z<WPX0sf(+@-GUohIu=Z<Z-QNFdea=@5m0-QS%Rzd3f$#kRNGE|2kz~^L!s7P30ZL1
zc!EwN#PG(-Xb&YogU1DX-;mvgyC08OY3F0XmchN^i>CMBpq1IF>$;Diqj`ATC!<QR
zBfH;MrT7-q;bv{BF^T{}k}02#c$9<d0hwW+Oj=>WA!nvgp&G!@5Tc1rZ-<JZ`{~=}
z>VWwZOyS~m5vW=4&t$rp3*O|VEZE++fwy%9KJcSkfz13kjipouyufA@q!ri#G@K|<
ze&xx4C78F;zPBEbQqmkeiRfo{Ila3iR||{`=ZddAPy`#_#MQIfl!MEvY6=StSs-vd
zCCi{-EeM?Q4wbl_1(W7gcFDLGf}vx>*A<IvAYtkiclm}UFhL@=S?O~zWSE_1sXkf*
z#l%VB%Xe3y=eeokfc|<|?KM6*Y;zxA3M!M@42qz2!8DuEY$teRO=D*fSPH8m@0juC
zw*l<=Cqs@>HIOqVLMhkk4kSCbvsEnsg3!b9R*sz+@T5?|iw}c$AtmRc^5m;aV3X++
z>G-Z@z-yK}RpU|#He``CQG_?a$k_<BVTC#vswk6s&B_lLNWO+f*0(|Fy`usVvK`QF
z{8HuEhE_PYuT*22wgxVkpE~_6tQzohURAC+(gE3hl8po`?}M|fk{%3giNLUBOgDx0
zAvk<K>-Mx)9Xvr{u9|+N6uQ436vg+nLyGv}P}wh?aFhZcik_<lU2I~e%DN3;OK(wb
zrD-F;-Y3sj=G+C(FuWEmVDJZ?+QlkLI;}8>Vs3<2xd`3}Yd$4T83Efqhl}uR>V`~M
z)Vpn7t?+IsuQAzTHBh*TGPV=G1~#xZEu^#FThfOaYG(QWtA0xUrhXn6aVq5Fr~;?D
zMc+O&cmR)-T;HZ=UJIIwi%URO9mvTV4~}#?2P-ZpTc<^KE$O`<^>+20h?Wt{>UzeB
zqF7zeM4jy&RFp_iCV`}u7>t{fi=_<)BZ~R{=!nHzJK!)<Xe{2=)yY8B08Ky%p7a+m
zCgz!LE?8T<E#8_y5@yVTe8<?jVvf5zIAD%pE$uzAE>;*DCntLXsg5x+AEEF#Hv-wR
zHkR5F#p6XCovhp)aIT`3PIxy25a;q!^SjUnID$J4?`Dg2KoiJ`2H22<*&x6q2=Gn<
z#agfq2qgArOmVc<(jRETx}^@SLy$aiw$?Uo1WL<qwqe}eY#m(D7#G|zcUuQ5j2jN`
z>g0m)kU$ft1O$vJks7YZn^<ccf%;n&^Z_ULV-7fjZ@;1mXd8;9zFRuG6KFKO+zt|G
zHBkbZOq%Q51T@!alDLIgS~@v6;4Iy2o$#(`T^B2y3(o3*t)&~9Kqsj0rcYRJjAZqw
zW42gV0=*5=stvUbda2jqgbg+{noI~J!xAI{h``x=tF`ePJ?_T{j5ZtAA_t4K?13e=
z{uAjEXe}Jp-di81bqGyh`kQkElzGVl2?EPn(5vt+`4{i3OPG}3du79d^oK|h8?is@
zkggGl`nyk?Y*rE=N!W~3Scg<#U#hTz0FJ*1;6&`@Ldq6VVg?`;qjvH+DGEg_#E4-+
zR!l!HnPgG2zm6D7WDzCyJvK1N)1MLZJxoNMy;YQU{VF~9kLi@Tt~|bWC+(msI#N2?
z&W@FA{K@54Px1Tq-Iq?FifM|E?cjj1dSu$$W1}Ybx7}-?I`1Dp<@k08*L3nTmNL-m
zmYtDC4g<q_=RfRT#iv#T$C|XK>F%uFwo#}_)lH5(<*ncR&GM&5Ug*zkdHf>8QTFh$
z9*Ik&yxTkHBWz5hT8=x09+0HTJEm*rY<cax?#cIk5uZ}S<8Dj2a#QM*<hsWMeVP0$
z;4j`^`z4FKjPAWUhqAAi&w_nu>E&6@jL%73(rSV@+^gn+<Q-F`p(kc(h-@bg4tjMU
z?yH4_>FWBVgxdStY>QeUv1yH25;29};{E%;1d<xFA*QP6>TB;a#2Bd?s$mQa_0@G$
zF^2n;w3G=bW8_~Hfh2@Ls)_njAdp40D8|{=(%u1w{w5K&csB=IS0sAL1k43+Zu*)i
zO;jj>eCdWfg%Bu0m;CVN)Llh81dmu&_d^2|^MA$6l3a=5@nU~Usn}mq`jd<9T5?f_
zpQ%cXX|+TAFL2Tq-DyCYs@|1K+Rx}lJL4*(oPW9NV!4a8yy@(W>p%!T$6@Er-befa
z=ifF&&%Up_&Z*g9RpeXoj&^LC8TI@m$AsLCk73R<XK`l=%S#s+(2*rJeS7JI?X}Z$
z6iZXZ%d(Eo>fq0r+Iz+Tj-XPd<DL9zZrhHLO+U*w(>l{6;lmK@-pX?D(vk3d|AH3b
z?2slw4*i*J?6t-P-lP{&Mlm{eK_@ild7Uo3)T)S5O@n6oMvEez9-^MDJ4zl+x5s-+
zJ1!&_-rV@y{$sCKcH6M*Ig>z3x^|JFa*o2us`KtV#`Re;9ABtJ2jxO~)lVE7O-vvY
zZn(<UD^f&z#f+}i%J{@E(^xA3J#U+n9l3vO+vdVOBt&=i7R~V6g1B<@pW^ZB?G|$s
z%fB&b%RPPL&%=kz-~a#oR}aPZdsKXzJAQIr@g?VF`bF<eOthogiT>L#>5q;vAWgzP
z-aAXqUh($uX}yhd;_h#G<V&K!qPnVX?4E__DyJeQQoqfO+a@lgdp;O+Cf!>Z)6o#a
zx3y-IXlrU~QA2L%@m);%3p)nN>=)>9mtA&N(^ov@6HE$7&5kBC>`-UFdwV{DI^WZt
zkTdSXpG27cdU0m>woN6kZ<9sUp5k>Ae3Fh9S3<40eUc6zI8@-G<H)C`SNcMK?trw*
zS)8Adlymw!5{0z3-Ckd0pJY0mnUs7XBQ&e5@SM{nwOL8_-Sdo#`RV<t0q_0g9;JRo
z6oQh<4%4}708tC)KehDNGnWa9;~!^yx#zL{l6#_t-i>c}>fXuJn2LW>8j&s&$?*Uy
z-ruXiptM=fLW0DQ=NXLz{<-u}imZp31_fz}=GTe4IVP8E$5hTZj4F=#S52z^0rzj=
zJ~iae6_UsB#e>D)0+Zd@mp3c2+kb&zqZbv#zA5VKX}Y}KjB^8R*HUt2^>)8RUyyFt
z)ZXBiHDPEGvnWEi>HMT&yo&KstwyB%{(_I$7J-`MbtmqP(ClI6!$&?-XiU;Mj;7Uk
z8PuO){AvCAG)3<zwxG5<+J4=c7bXOZ-)T~`)9)h{PDm?Gh^G5|-Jd(SoQ0O{Ow(1p
zs=l0n{(}r}1_(A9-Ep;fGCoDhBc?nWiiD@kQlqWTb1Y{u98CLj!Y?+FlbYO1ohu5n
z)6ZwjbPlvuCRLGU-B6Pt_4?Ji%bKkATC--mTT(^|1{3=h$vawwO%LCb>3>+-Fu8@k
zaK5T9bN-2jp9?L=CGQ%SQVpYO=ZUT@_Q5l6FharMAI|B<O=DjV92M=KZI<yhyyX{=
zB;Pi`CxF)>?|2r-EKwDR-Q1p|k=T+Nd+B5A-J#huXWk&Hf|iBj!TcQFwiDc3<I!|Q
zeOFCepYZitg}&XGp7FUt@tE+(z>N30w|R~-+liSgUv8-tDkCL2UGufdzD~s9UQnzx
zO&Owg{GQYOoqhfk>Jm$#F1V_@eV?}fx|>I7>(uR}4ViUCi<#PnS$G%*G14k2-Pp|i
zz%f<l;ls-*<aHCa5`R!t-oyn=_HilHQ>o_8kAz2wweDvt9xLr9*;<%W;(Y8}m@{SG
z<0livw*pMmo5JI$cfZ>Ip2CH}afZux^GnYzhIa0j{yzxXqEzz+BxgOWXryhh7w->w
z^anAy$?{&jcrNV7>w}#g(F5$vl)=NjN)p};!SnN2*hd<z<Be7K>b@tbN&BRxXThlN
zP+il-?wLAVl$ej<%$@n3ch8=1;1`umdzuBWm}z;W6Gy;BR*k0{347Hw!~Q6W`(GJ>
z<)P2H_E>z6|89>rr4D~oIiE6>6FF}GSpQV?I;KP|RD#A_kb(?>{kjf^e49oHd6?!`
z+&0l(K||#f51X9(hyVC6H2d*{?C_gcg(I?J!*mPX3}3yZea?Z1)7DOyXPx84GGyZ=
z(c<z&r>hd_4)EkYRcd8UR;eE0*~(!hgc1*LwBmUfPkPbp_*Qe9%%E3#-CAUA4KrIe
zs`tMMj<EGfjI~h7-9<KPy;Z$>n=WbV0j07Fq9&??GgHBhv$A}}6o=!-3a@56CG&+T
zC%U@srETQ9DRMsNvAp=!?eA0aI}+$VWp6$+di2rrS25Qp+R73~nC~AxG-^ya?kN)1
z`82U!&~sG!vk9H@){WZ6N(mHlCwh-esxwk7?z1|UbuPgqVJ!F_&0fH`-+@Qlt4)bf
z?_k#Wm-_oc^E^2mUR!ndyM~QDJk^nuV7HiE63oJ`H5+6bc#CJ>k%<%6?Q<R&RbCAe
z_KYdef}UNAS0<Yk;~lQnnfu-=+sdu;UdY_;^aJHFcE*W5Vdms;m(e|F8GcJw-y(-$
z10$}C{2G{FnL9eo;uh5PHfhF37I&{jt4r2ep-!YT1@|OD#w6XukUo<vRcS~>=F1-u
zS4kdOa7sUA84!JZ{LZ^-^D;{FGJ4K8UQg3yXuN55f4zR3<V{Eb^iUq|5Ab6T8NRG+
zA5(tIMmc9RwdM;Q(Pxi0`1w&HIZ`pQrd~a{)kDeup3kn597rxHhRoVt`j=e%^q?K}
zfT$nJvmcoc8(lNx*-)WT%Np2wgPo`!(ieAalDqtJwkezJssm;VcxNR@?j;kc{)8S(
zp09XD9_YoJd5%|e?<q;Mi}K01bK3N+HUc?w=4FJ(pIMvrM=b)yJnm`gODw`jrG`zn
zw>f?qdMUL%W9V9aOzxXhtA?I0_OAnmvZf;6@&$W;Y1v$Ip{3;HHLfn2&Ictt1sfAw
z?Sxf=is~=aP!`3Qii}(-IQ^xBKG3R9#)ARFCFb$|lSj_}eP>QezVH<qQ&#Zfa=E5K
zmL6vuaGAa&fd8}QRWbRB8k$CPv(54gOUXp^L-Lp7h^j=csVXKY`hR?cmMaR)^0E4^
zAA^Oz$RAE#K2XDIHBjeRDMxr)7LO+H7M;I68*7)4o60U&lrWnzJ72YB3YWZRAC>>y
zZnTcl%)+ie>Z(GryT!H~@%k|Hu_J?vLPlc8XG-amSB6hr<!`%Ow7`sq37hp&<tcD}
z3@<e}sXTL~dcMY%ozGW?X}z#qA#Hfu`UMiv97bQcDA74Xrm{V!i)r(ArwQ&9NTS9s
z_;vOV2TZ-~F`(K#COxB-d{ipi;*J<|r2>ljs-92VVI}ETi!VcH3ru!e2yE{*^UmGB
zD3#6Al{C^3U(Qe2?lf%`r{Z~l$>({suuyZwv-gTDP)^{q8lzO4IC)8sJfW0ISU<xu
z^G1}lH|<jXz|8#mJ<GEE;h;H^Kd^O76Y~d2(}&VX{y>MHOY6(+L-GedawLCfk@%KB
zbfhEs!xfBBNce{c)sbmz!N5_m{@Dh}3PUL*fADP^;1?*+o$J7+k4ls;<qv0)T2f*|
zKDGo8MJ?qILantv!ThrJd<mf?YRL}KI(PAp>@PUPI2P?!2r8KL2;b4l8x#ZwW8Ipc
z7A0KGihoHM$B+@FfPqEwRoAiv;@3+i9u(JVj)r%2!{Xgszim@)UhXhIL~tOf;L^(4
zQH<J%xMfBxtM55*pd^0Xq6f~!71<3_Q4(FFtsk0R`YMSg_O~<5`VgXgC}PpmBq84Z
zI+I_1I>%1@W5u<`3H^^i%a5^E0&PHgy*iNik3h@!yH^73L=wK$fh2weTD}pz5=f7F
zEg;DsftK&jajxJ8g<b=Q=pNso{$3U>-+Yo?0VP4V3e<nGz4xQR%QteAR<x3_{`Xt?
zTP1(&)$%<EgB9%|r^3G;vLAZ&zuYWZtHb6i3~*xjR}5I4ixo@a6{h%X{7q9<$t{Q*
z4?m8?^0kr4iq_vU{`0M`&cgB)g5e4~u*jjzFN63$dBlHMm;K1Z@_g^;3JXS9{~Zff
zXJbWXv%;1On}5@mReXY&Z~n-{^674+Psnln^R55L!rxQP@=1_q1q)eRtJ<+_s{JwO
zmrrX*N0wboTUNpQb0XE#T(?f-&-b~=cKlTn(Tb((`<;S#nuc7l2p;i5>Mi|pVf(l5
VNDhJgMnOEKLopy9;``s;{T~`>-Le1x

literal 0
HcmV?d00001

diff --git a/examples/fluids/tests-output/createPyTorchModel/README.md b/examples/fluids/tests-output/createPyTorchModel/README.md
new file mode 100644
index 0000000000..cc34055a4b
--- /dev/null
+++ b/examples/fluids/tests-output/createPyTorchModel/README.md
@@ -0,0 +1 @@
+This directory exists to create a PyTorch model with certain weights and biases. It is mostly setup for testing purposes.
diff --git a/examples/fluids/tests-output/createPyTorchModel/update_weights.py b/examples/fluids/tests-output/createPyTorchModel/update_weights.py
new file mode 100755
index 0000000000..a30304d647
--- /dev/null
+++ b/examples/fluids/tests-output/createPyTorchModel/update_weights.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+import torch
+import torch.nn as nn
+from pathlib import Path
+import numpy as np
+
+
+new_parameters_Path = Path('../../dd_sgs_data')
+
+weights = []
+biases = []
+weights.append(np.loadtxt(new_parameters_Path / 'w1.dat', skiprows=1).reshape(6, 20).T)
+weights.append(np.loadtxt(new_parameters_Path / 'w2.dat', skiprows=1).reshape(20, 6).T)
+biases.append(np.loadtxt(new_parameters_Path / 'b1.dat', skiprows=1))
+biases.append(np.loadtxt(new_parameters_Path / 'b2.dat', skiprows=1))
+
+# Anisotropic SGS model for LES developed by Aviral Prakash and John A. Evans at UCB
+
+
+class anisoSGS(nn.Module):
+    # The class takes as inputs the input and output dimensions and the number of layers
+    def __init__(self, inputDim=6, outputDim=6, numNeurons=20, numLayers=1):
+        super().__init__()
+        self.ndIn = inputDim
+        self.ndOut = outputDim
+        self.nNeurons = numNeurons
+        self.nLayers = numLayers
+        self.net = nn.Sequential(
+            nn.Linear(self.ndIn, self.nNeurons),
+            nn.LeakyReLU(0.3),
+            nn.Linear(self.nNeurons, self.ndOut))
+
+    # Define the method to do a forward pass
+    def forward(self, x):
+        return self.net(x)
+
+
+def load_n_trace_model(model_name):
+    # Instantiate PT model and load state dict
+    model = anisoSGS()
+    model.load_state_dict(torch.load(f'{model_name}.pt', map_location=torch.device('cpu')))
+    model.double()
+
+    # Change individual model weights
+    with torch.no_grad():
+        for i, layer in enumerate([0, 2]):
+            m, n = model.net[layer].weight.shape
+            print('weight shape', m, n)
+
+            model.net[layer].weight[...] = torch.from_numpy(weights[i])[...]
+            model.net[layer].bias[...] = torch.from_numpy(biases[i])[...]
+
+    # Prepare model for inference
+    dummy_input = torch.randn(512, 6, dtype=torch.float64, device='cpu')
+    with torch.no_grad():
+        # model_script = torch.jit.script(model)
+        # torch.jit.save(model_script, f"{model_name}_fp64_jit.ptc")
+
+        model = torch.jit.trace(model, dummy_input)
+        torch.jit.save(model, f"{model_name}_fp64_jit.pt")
+
+    return model
+
+
+def main():
+    model_name = 'NNModel_HIT'
+    model = load_n_trace_model(model_name)
+
+
+if __name__ == '__main__':
+    main()

From 9d9921469e04172ec2c38394694d3e721744c768 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sat, 11 May 2024 08:50:53 -0600
Subject: [PATCH 064/571] fluids: Add log_events.c/h, PetscLogEvent for torch

To have the log_events accessbile to torch (in C++), I needed to
separate out the header file containing the extern PetscLogEvent
declarations. While I was at it, I figured it'd be more clear to have a
separate log_events.c file as well to have the actual "storage" of the
PetscLogEvents and the RegisterLogEvents function itself.
---
 examples/fluids/include/log_events.h          | 25 ++++++++++
 examples/fluids/navierstokes.h                | 13 +----
 examples/fluids/problems/sgs_dd_model.c       |  8 +++
 .../fluids/problems/torch/sgs_model_torch.cpp |  9 ++++
 examples/fluids/src/log_events.c              | 50 +++++++++++++++++++
 examples/fluids/src/misc.c                    | 26 ----------
 6 files changed, 93 insertions(+), 38 deletions(-)
 create mode 100644 examples/fluids/include/log_events.h
 create mode 100644 examples/fluids/src/log_events.c

diff --git a/examples/fluids/include/log_events.h b/examples/fluids/include/log_events.h
new file mode 100644
index 0000000000..1649da9f5a
--- /dev/null
+++ b/examples/fluids/include/log_events.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <petsc.h>
+
+extern PetscLogEvent FLUIDS_CeedOperatorApply;
+extern PetscLogEvent FLUIDS_CeedOperatorAssemble;
+extern PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal;
+extern PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
+extern PetscLogEvent FLUIDS_SmartRedis_Init;
+extern PetscLogEvent FLUIDS_SmartRedis_Meta;
+extern PetscLogEvent FLUIDS_SmartRedis_Train;
+extern PetscLogEvent FLUIDS_TrainDataCompute;
+extern PetscLogEvent FLUIDS_DifferentialFilter;
+extern PetscLogEvent FLUIDS_VelocityGradientProjection;
+extern PetscLogEvent FLUIDS_SgsModel;
+extern PetscLogEvent FLUIDS_SgsModelDDInference;
+extern PetscLogEvent FLUIDS_SgsModelDDData;
+
+PetscErrorCode RegisterLogEvents();
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 1c46f57776..de1fd300f0 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <ceed.h>
+#include <log_events.h>
 #include <mat-ceed.h>
 #include <petsc-ceed-utils.h>
 #include <petscts.h>
@@ -85,18 +86,6 @@ static const char *const MeshTransformTypes[] = {"none", "platemesh", "MeshTrans
 static const char *const DifferentialFilterDampingFunctions[] = {
     "none", "van_driest", "mms", "DifferentialFilterDampingFunction", "DIFF_FILTER_DAMP_", NULL};
 
-// -----------------------------------------------------------------------------
-// Log Events
-// -----------------------------------------------------------------------------
-extern PetscLogEvent FLUIDS_CeedOperatorApply;
-extern PetscLogEvent FLUIDS_SmartRedis_Init;
-extern PetscLogEvent FLUIDS_SmartRedis_Meta;
-extern PetscLogEvent FLUIDS_SmartRedis_Train;
-extern PetscLogEvent FLUIDS_TrainDataCompute;
-extern PetscLogEvent FLUIDS_DifferentialFilter;
-extern PetscLogEvent FLUIDS_VelocityGradientProjection;
-PetscErrorCode       RegisterLogEvents();
-
 // -----------------------------------------------------------------------------
 // Structs
 // -----------------------------------------------------------------------------
diff --git a/examples/fluids/problems/sgs_dd_model.c b/examples/fluids/problems/sgs_dd_model.c
index c87ca93f35..57f90889cf 100644
--- a/examples/fluids/problems/sgs_dd_model.c
+++ b/examples/fluids/problems/sgs_dd_model.c
@@ -186,7 +186,13 @@ PetscErrorCode SgsDDNodalStressEval_Sequential_Ceed(Vec DD_Inputs_loc, Vec DD_Ou
   OperatorApplyContext op_context = *(OperatorApplyContext *)ctx;
 
   PetscFunctionBeginUser;
+  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
+  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
+  PetscCall(PetscLogGpuTimeBegin());
   PetscCall(ApplyCeedOperatorLocalToLocal(DD_Inputs_loc, DD_Outputs_loc, op_context));
+  PetscCall(PetscLogGpuTimeEnd());
+  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
+  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -475,6 +481,7 @@ PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc) {
   PetscMemType sgs_nodal_mem_type;
 
   PetscFunctionBeginUser;
+  PetscCall(PetscLogEventBegin(FLUIDS_SgsModel, Q_loc, G_loc, NULL, NULL));
   PetscCall(DMGetGlobalVector(user->grad_velo_proj->dm, &VelocityGradient));
   PetscCall(VelocityGradientProjectionApply(user->grad_velo_proj, Q_loc, VelocityGradient));
 
@@ -490,6 +497,7 @@ PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc) {
   PetscCall(VecCeedToPetsc(sgs_dd_data->sgs_nodal_ceed, sgs_nodal_mem_type, SGSNodal_loc));
   PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_sgs, &SGSNodal_loc));
   PetscCall(DMRestoreGlobalVector(user->grad_velo_proj->dm, &VelocityGradient));
+  PetscCall(PetscLogEventEnd(FLUIDS_SgsModel, Q_loc, G_loc, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
diff --git a/examples/fluids/problems/torch/sgs_model_torch.cpp b/examples/fluids/problems/torch/sgs_model_torch.cpp
index 969bc57ebb..28641b8664 100644
--- a/examples/fluids/problems/torch/sgs_model_torch.cpp
+++ b/examples/fluids/problems/torch/sgs_model_torch.cpp
@@ -5,6 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
+#include <log_events.h>
 #include <petsc.h>
 #include <sgs_model_torch.h>
 #include <torch/script.h>
@@ -72,6 +73,7 @@ PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) {
 
   PetscFunctionBeginUser;
   // torch::NoGradGuard no_grad; // equivalent to "with torch.no_grad():" in PyTorch
+  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
   {  // Transfer DD_Inputs_loc into input_tensor
     PetscMemType         input_mem_type;
     PetscInt             input_size, num_nodes;
@@ -104,10 +106,16 @@ PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) {
     }
     PetscCall(VecRestoreArrayReadAndMemType(DD_Inputs_loc, &dd_inputs_ptr));
   }
+  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
 
   // Run model
+  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
+  PetscCall(PetscLogGpuTimeBegin());
   PetscCallCXX(output_tensor = model.forward({input_tensor}).toTensor());
+  PetscCall(PetscLogGpuTimeEnd());
+  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
 
+  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
   {  // Transfer output_tensor to DD_Outputs_loc
     torch::DeviceType    dd_output_device;
     torch::TensorOptions options;
@@ -148,5 +156,6 @@ PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) {
       PetscCall(VecRestoreArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr));
     }
   }
+  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/log_events.c b/examples/fluids/src/log_events.c
new file mode 100644
index 0000000000..c5e968b485
--- /dev/null
+++ b/examples/fluids/src/log_events.c
@@ -0,0 +1,50 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <log_events.h>
+#include <petsc.h>
+
+static PetscClassId libCEED_classid, onlineTrain_classid, sgs_model_classid, misc_classid;
+
+PetscLogEvent FLUIDS_CeedOperatorApply;
+PetscLogEvent FLUIDS_CeedOperatorAssemble;
+PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal;
+PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
+PetscLogEvent FLUIDS_SmartRedis_Init;
+PetscLogEvent FLUIDS_SmartRedis_Meta;
+PetscLogEvent FLUIDS_SmartRedis_Train;
+PetscLogEvent FLUIDS_TrainDataCompute;
+PetscLogEvent FLUIDS_DifferentialFilter;
+PetscLogEvent FLUIDS_VelocityGradientProjection;
+PetscLogEvent FLUIDS_SgsModel;
+PetscLogEvent FLUIDS_SgsModelDDInference;
+PetscLogEvent FLUIDS_SgsModelDDData;
+
+PetscErrorCode RegisterLogEvents() {
+  PetscFunctionBeginUser;
+  PetscCall(PetscClassIdRegister("libCEED", &libCEED_classid));
+  PetscCall(PetscLogEventRegister("CeedOpApply", libCEED_classid, &FLUIDS_CeedOperatorApply));
+  PetscCall(PetscLogEventRegister("CeedOpAsm", libCEED_classid, &FLUIDS_CeedOperatorAssemble));
+  PetscCall(PetscLogEventRegister("CeedOpAsmD", libCEED_classid, &FLUIDS_CeedOperatorAssembleDiagonal));
+  PetscCall(PetscLogEventRegister("CeedOpAsmPBD", libCEED_classid, &FLUIDS_CeedOperatorAssemblePointBlockDiagonal));
+
+  PetscCall(PetscClassIdRegister("onlineTrain", &onlineTrain_classid));
+  PetscCall(PetscLogEventRegister("SmartRedis_Init", onlineTrain_classid, &FLUIDS_SmartRedis_Init));
+  PetscCall(PetscLogEventRegister("SmartRedis_Meta", onlineTrain_classid, &FLUIDS_SmartRedis_Meta));
+  PetscCall(PetscLogEventRegister("SmartRedis_Train", onlineTrain_classid, &FLUIDS_SmartRedis_Train));
+  PetscCall(PetscLogEventRegister("TrainDataCompute", onlineTrain_classid, &FLUIDS_TrainDataCompute));
+
+  PetscCall(PetscClassIdRegister("SGS Model", &sgs_model_classid));
+  PetscCall(PetscLogEventRegister("SgsModel", sgs_model_classid, &FLUIDS_SgsModel));
+  PetscCall(PetscLogEventRegister("SgsModelDDInfer", sgs_model_classid, &FLUIDS_SgsModelDDInference));
+  PetscCall(PetscLogEventRegister("SgsModelDDData", sgs_model_classid, &FLUIDS_SgsModelDDData));
+
+  PetscCall(PetscClassIdRegister("Miscellaneous", &misc_classid));
+  PetscCall(PetscLogEventRegister("DiffFilter", misc_classid, &FLUIDS_DifferentialFilter));
+  PetscCall(PetscLogEventRegister("VeloGradProj", misc_classid, &FLUIDS_VelocityGradientProjection));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index cefd38952c..60c078b0d5 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -365,32 +365,6 @@ PetscErrorCode PhastaDatFileReadToArrayReal(MPI_Comm comm, const char path[PETSC
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-PetscLogEvent       FLUIDS_CeedOperatorApply;
-PetscLogEvent       FLUIDS_SmartRedis_Init;
-PetscLogEvent       FLUIDS_SmartRedis_Meta;
-PetscLogEvent       FLUIDS_SmartRedis_Train;
-PetscLogEvent       FLUIDS_TrainDataCompute;
-PetscLogEvent       FLUIDS_DifferentialFilter;
-PetscLogEvent       FLUIDS_VelocityGradientProjection;
-static PetscClassId libCEED_classid, onlineTrain_classid, misc_classid;
-
-PetscErrorCode RegisterLogEvents() {
-  PetscFunctionBeginUser;
-  PetscCall(PetscClassIdRegister("libCEED", &libCEED_classid));
-  PetscCall(PetscLogEventRegister("CeedOpApply", libCEED_classid, &FLUIDS_CeedOperatorApply));
-
-  PetscCall(PetscClassIdRegister("onlineTrain", &onlineTrain_classid));
-  PetscCall(PetscLogEventRegister("SmartRedis_Init", onlineTrain_classid, &FLUIDS_SmartRedis_Init));
-  PetscCall(PetscLogEventRegister("SmartRedis_Meta", onlineTrain_classid, &FLUIDS_SmartRedis_Meta));
-  PetscCall(PetscLogEventRegister("SmartRedis_Train", onlineTrain_classid, &FLUIDS_SmartRedis_Train));
-  PetscCall(PetscLogEventRegister("TrainDataCompute", onlineTrain_classid, &FLUIDS_TrainDataCompute));
-
-  PetscCall(PetscClassIdRegister("Miscellaneous", &misc_classid));
-  PetscCall(PetscLogEventRegister("DiffFilter", misc_classid, &FLUIDS_DifferentialFilter));
-  PetscCall(PetscLogEventRegister("VeloGradProj", misc_classid, &FLUIDS_VelocityGradientProjection));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
 // Print information about the given simulation run
 PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS ts) {
   Ceed     ceed = user->ceed;

From 49fc647510b7e0dcf730e37ef22e49b8e60c5698 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 20 May 2024 17:23:42 -0400
Subject: [PATCH 065/571] doc(fluids): Add documentation for pytorch SGS

---
 examples/fluids/index.md | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index cf52923930..57884a0d50 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -370,12 +370,20 @@ The current data-driven model parameters are not accurate and are for regression
 There are two different modes for using the data-driven model: fused and sequential.
 
 In fused mode, the input processing, model inference, and output handling were all done in a single CeedOperator.
-Conversely, sequential mode has separate function calls/CeedOperators for input creation, model inference, and output handling.
-By separating the three steps to the model evaluation, the sequential mode allows for functions calling external libraries to be used for the model inference step.
-This however is slower than the fused kernel, but this requires a native libCEED inference implementation.
-
+Fused mode is generally faster than the sequential mode, however fused mode requires that the model architecture be manually implemented into a libCEED QFunction.
 To use the fused mode, set `-sgs_model_dd_implementation fused`.
-To use the sequential mode, set the same flag to `sequential_internal`.
+
+Sequential mode has separate function calls/CeedOperators for input creation, model inference, and output handling.
+By separating the three steps of the model evaluation, the sequential mode allows for functions calling external libraries to be used for the model inference step.
+The use of these external libraries allows us to leverage the flexibility of those external libraries in their model architectures.
+
+PyTorch is currently the only external library implemented with the sequential mode.
+This is enabled with `USE_TORCH=1` during the build process, which will use the PyTorch accessible from the build environment's Python interpreter.
+To specify the path to the PyTorch model file, use `-sgs_model_dd_torch_model_path`.
+The hardware used to run the model inference is determined automatically from the libCEED backend chosen, but can be overridden with `-sgs_model_dd_torch_model_device`.
+Note that if you chose to run the inference on host while using a GPU libCEED backend (e.g. `/gpu/cuda`), then host-to-device transfers (and vice versa) will be done automatically.
+
+The sequential mode is available using a libCEED based inference evaluation via `-sgs_model_dd_implementation sequential_ceed`, but it is only for verification purposes.
 
 (differential-filtering)=
 ### Differential Filtering

From bf1dd0b83bbc377b1ab74182a1694549c8eaf983 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 20 May 2024 21:40:10 +0000
Subject: [PATCH 066/571] fluids: Use TORCH_DEVICE_CPU for sycl backends

On Sunspot, on-device inference is not working reliably. I'm not sure
exactly why at the moment (whether it's a libCEED backend issue or
something else).
---
 examples/fluids/problems/sgs_dd_model.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/fluids/problems/sgs_dd_model.c b/examples/fluids/problems/sgs_dd_model.c
index 57f90889cf..b8e6dcdbc5 100644
--- a/examples/fluids/problems/sgs_dd_model.c
+++ b/examples/fluids/problems/sgs_dd_model.c
@@ -209,7 +209,8 @@ static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Torch(Ceed ceed, SgsD
   PetscCallCeed(ceed, CeedGetResource(ceed, &ceed_resource));
   if (strstr(ceed_resource, "/gpu/cuda")) model_device_type = TORCH_DEVICE_CUDA;
   else if (strstr(ceed_resource, "/gpu/hip")) model_device_type = TORCH_DEVICE_HIP;
-  else if (strstr(ceed_resource, "/gpu/sycl")) model_device_type = TORCH_DEVICE_XPU;
+  // On-device XPU is not working reliably currently, default to CPU inference evaluation
+  // else if (strstr(ceed_resource, "/gpu/sycl")) model_device_type = TORCH_DEVICE_XPU;
   else model_device_type = TORCH_DEVICE_CPU;
   PetscCall(PetscOptionsGetEnum(NULL, NULL, "-sgs_model_dd_torch_model_device", TorchDeviceTypes, (PetscEnum *)&model_device_type, NULL));
   PetscCall(PetscOptionsGetString(NULL, NULL, "-sgs_model_dd_torch_model_path", model_path, sizeof(model_path), NULL));

From c0bf4d34824fff9c5386363796395f42cde174e3 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 24 May 2024 11:46:33 -0600
Subject: [PATCH 067/571] fluids: Force recompilation if Makefile changes

---
 Makefile                 | 2 +-
 examples/fluids/Makefile | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index a8acff1644..4c10f10250 100644
--- a/Makefile
+++ b/Makefile
@@ -623,7 +623,7 @@ $(OBJDIR)/petsc-% : examples/petsc/%.c examples/petsc/libutils.a.PHONY $(libceed
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/petsc/$* $@
 
-$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/problems/*.c examples/fluids/problems/torch/*.cpp examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) | $$(@D)/.DIR
+$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/problems/*.c examples/fluids/problems/torch/*.cpp examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) examples/fluids/Makefile | $$(@D)/.DIR
 	+$(call quiet,MAKE) -C examples/fluids CEED_DIR=`pwd` \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/fluids/$* $@
diff --git a/examples/fluids/Makefile b/examples/fluids/Makefile
index 3465b77af7..bcec70e4f9 100644
--- a/examples/fluids/Makefile
+++ b/examples/fluids/Makefile
@@ -103,10 +103,10 @@ navierstokes: $(src.o) | $(PETSc.pc) $(ceed.pc)
 # Quiet, color output
 quiet ?= $($(1))
 
-$(OBJDIR)/%.o : %.c | $$(@D)/.DIR
+$(OBJDIR)/%.o : %.c  Makefile | $$(@D)/.DIR
 	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
 
-$(OBJDIR)/%.o : %.cpp | $$(@D)/.DIR
+$(OBJDIR)/%.o : %.cpp Makefile | $$(@D)/.DIR
 	$(call quiet,CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $(abspath $<)
 
 print: $(PETSc.pc) $(ceed.pc)

From 6c7f295c86caf7e1632954e5e8e37f2d592c6f7f Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 10 Jan 2024 09:53:45 -0700
Subject: [PATCH 068/571] docs(fluids): Spanwise stats

---
 examples/fluids/index.md | 106 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index f035168ec0..fa380989a0 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -317,6 +317,112 @@ Note that this wave speed is specific to ideal gases as $\gamma$ is an ideal gas
 Currently, this demo provides three types of problems/physical models that can be selected at run time via the option `-problem`.
 {ref}`problem-advection`, the problem of the transport of energy in a uniform vector velocity field, {ref}`problem-euler-vortex`, the exact solution to the Euler equations, and the so called {ref}`problem-density-current` problem.
 
+### Statistics Collection
+For scale-resolving simulations (such as LES and DNS), statistics for a simulation are more often useful than time-instantaneous snapshots of the simulation itself.
+To make this process more computationally efficient, averaging in the spanwise direction, if physically correct, can help reduce the amount of simulation time needed to get converged statistics.
+
+First, let's more precisely define what we mean by spanwise average.
+Denote $\langle \phi \rangle$ as the Reynolds average of $\phi$, which in this case would be a average over the spanwise direction and time:
+
+$$
+\langle \phi \rangle(x,y) = \frac{1}{L_z + (T_f - T_0)}\int_0^{L_z} \int_{T_0}^{T_f} \phi(x, y, z, t) \mathrm{d}t \mathrm{d}z
+$$
+
+where $z$ is the spanwise direction, the domain has size $[0, L_z]$ in the spanwise direction, and $[T_0, T_f]$ is the range of time being averaged over.
+Note that here and in the code, **we assume the spanwise direction to be in the $z$ direction**.
+
+To discuss the details of the implementation we'll first discuss the spanwise integral, then the temporal integral, and lastly the statistics themselves.
+
+#### Spanwise Integral
+The function $\langle \phi \rangle (x,y)$ is represented on a 2-D finite element grid, taken from the full domain mesh itself.
+If isoperiodicity is set, the periodic face is extracted as the spanwise statistics mesh.
+Otherwise the negative z face is used.
+We'll refer to this mesh as the *parent grid*, as for every "parent" point in the parent grid, there are many "child" points in the full domain.
+Define a function space on the parent grid as $\mathcal{V}_p^\mathrm{parent} = \{ \bm v(\bm x) \in H^{1}(\Omega_e^\mathrm{parent}) \,|\, \bm v(\bm x_e(\bm X)) \in P_p(\bm{I}), e=1,\ldots,N_e \}$.
+We enforce that the order of the parent FEM space is equal to the full domain's order.
+
+Many statistics are the product of 2 or more solution functions, which results in functions of degree higher than the parent FEM space, $\mathcal{V}_p^\mathrm{parent}$.
+To represent these higher-order functions on the parent FEM space, we perform an $L^2$ projection.
+Define the spanwise averaged function as:
+
+$$
+\langle \phi \rangle_z(x,y,t) = \frac{1}{L_z} \int_0^{L_z} \phi(x, y, z, t) \mathrm{d}z
+$$
+
+where the function $\phi$ may be the product of multiple solution functions and $\langle \phi \rangle_z$ denotes the spanwise average.
+The projection of a function $u$ onto the parent FEM space would look like:
+
+$$
+\bm M u_N = \int_0^{L_x} \int_0^{L_y} u \psi^\mathrm{parent}_N \mathrm{d}y \mathrm{d}x
+$$
+where $\bm M$ is the mass matrix for $\mathcal{V}_p^\mathrm{parent}$, $u_N$ the coefficients of the projected function, and $\psi^\mathrm{parent}_N$ the basis functions of the parent FEM space.
+Substituting the spanwise average of $\phi$ for $u$, we get:
+
+$$
+\bm M [\langle \phi \rangle_z]_N = \int_0^{L_x} \int_0^{L_y} \left [\frac{1}{L_z} \int_0^{L_z} \phi(x,y,z,t) \mathrm{d}z \right ] \psi^\mathrm{parent}_N(x,y) \mathrm{d}y \mathrm{d}x
+$$
+
+The triple integral in the right hand side is just an integral over the full domain
+
+$$
+\bm M [\langle \phi \rangle_z]_N = \frac{1}{L_z} \int_\Omega \phi(x,y,z,t) \psi^\mathrm{parent}_N(x,y) \mathrm{d}\Omega
+$$
+
+We need to evaluate $\psi^\mathrm{parent}_N$ at quadrature points in the full domain.
+To do this efficiently, **we assume and exploit the full domain grid to be a tensor product in the spanwise direction**.
+This assumption means quadrature points in the full domain have the same $(x,y)$ coordinate location as quadrature points in the parent domain.
+This also allows the use of the full domain quadrature weights for the triple integral.
+
+#### Temporal Integral/Averaging
+To calculate the temporal integral, we do a running average using left-rectangle rule.
+At the beginning of each simulation, the time integral of a statistic is set to 0, $\overline{\phi} = 0$.
+Periodically, the integral is updated using left-rectangle rule:
+
+$$\overline{\phi}_\mathrm{new} = \overline{\phi}_{\mathrm{old}} + \phi(t_\mathrm{new}) \Delta T$$
+where $\phi(t_\mathrm{new})$ is the statistic at the current time and $\Delta T$ is the time since the last update.
+When stats are written out to file, this running sum is then divided by $T_f - T_0$ to get the time average.
+
+With this method of calculating the running time average, we can plug this into the $L^2$ projection of the spanwise integral:
+
+$$
+\bm M [\langle \phi \rangle]_N = \frac{1}{L_z + (T_f - T_0)} \int_\Omega \int_{T_0}^{T_f} \phi(x,y,z,t) \psi^\mathrm{parent}_N \mathrm{d}t \mathrm{d}\Omega
+$$
+where the integral $\int_{T_0}^{T_f} \phi(x,y,z,t) \mathrm{d}t$ is calculated on a running basis.
+
+
+#### Running
+As the simulation runs, it takes a running time average of the statistics at the full domain quadrature points.
+This running average is only updated at the interval specified by `-ts_monitor_turbulence_spanstats_collect_interval` as number of timesteps.
+The $L^2$ projection problem is only solved when statistics are written to file, which is controlled by `-ts_monitor_turbulence_spanstats_viewer_interval`.
+Note that the averaging is not reset after each file write.
+The average is always over the bounds $[T_0, T_f]$, where $T_f$ in this case would be the time the file was written at and $T_0$ is the solution time at the beginning of the run.
+
+#### Turbulent Statistics
+
+The focus here are those statistics that are relevant to turbulent flow.
+The terms collected are listed below, with the mathematical definition on the left and the label (present in CGNS output files) is on the right.
+
+| Math                           | Label                           |
+| -----------------              | --------                        |
+| $\langle \rho \rangle$         | MeanDensity                     |
+| $\langle p \rangle$            | MeanPressure                    |
+| $\langle p^2 \rangle$          | MeanPressureSquared             |
+| $\langle p u_i \rangle$        | MeanPressureVelocity[$i$]       |
+| $\langle \rho T \rangle$       | MeanDensityTemperature          |
+| $\langle \rho T u_i \rangle$   | MeanDensityTemperatureFlux[$i$] |
+| $\langle \rho u_i \rangle$     | MeanMomentum[$i$]               |
+| $\langle \rho u_i u_j \rangle$ | MeanMomentumFlux[$ij$]          |
+| $\langle u_i \rangle$          | MeanVelocity[$i$]               |
+
+where [$i$] are suffixes to the labels. So $\langle \rho u_x u_y \rangle$ would correspond to MeanMomentumFluxXY.
+This naming convention attempts to mimic the CGNS standard.
+
+To get second-order statistics from these terms, simply use the identity:
+
+$$
+\langle \phi' \theta' \rangle = \langle \phi \theta \rangle - \langle \phi \rangle \langle \theta \rangle
+$$
+
 ### Subgrid Stress Modeling
 
 When a fluid simulation is under-resolved (the smallest length scale resolved by the grid is much larger than the smallest physical scale, the [Kolmogorov length scale](https://en.wikipedia.org/wiki/Kolmogorov_microscales)), this is mathematically interpreted as filtering the Navier-Stokes equations.

From 637c7b11eaf8aa85ae3f6db9c10f649d172df128 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 24 May 2024 13:43:21 -0600
Subject: [PATCH 069/571] test(fluids): Add SGS torch testing

---
 .gitlab-ci.yml                       | 16 ++++++++++++----
 Makefile                             | 11 +++++++----
 examples/fluids/navierstokes.c       |  4 ++--
 examples/fluids/pytorch_pkgconfig.py |  2 ++
 tests/junit.py                       |  8 +++++++-
 5 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 497bfd3139..78e052b855 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -96,9 +96,13 @@ noether-cpu:
 # Libraries for examples
 # -- PETSc with HIP (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/phypid/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
+    - source /home/phypid/spack/share/spack/setup-env.sh && spack load py-torch@2.3+cuda && export USE_TORCH=1
+    - export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids solids"
+    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids"
+    - spack unload py-torch@2.3+cuda
+    - source /home/phypid/SmartSimTestingSoftware/bin/activate
+    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="fluids-py-smartsim_regression_framework"
 # -- MFEM v4.6
     - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
     - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
@@ -196,9 +200,13 @@ noether-cuda:
 # Libraries for examples
 # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/phypid/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
+    - source /home/phypid/spack/share/spack/setup-env.sh && spack load py-torch@2.3+cuda && export USE_TORCH=1
+    - export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="cuda" junit BACKENDS="$BACKENDS_GPU" search="petsc fluids solids"
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="cuda" junit BACKENDS="$BACKENDS_GPU" search="petsc fluids-navierstokes solids"
+    - spack unload py-torch@2.3+cuda
+    - source /home/phypid/SmartSimTestingSoftware/bin/activate
+    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="fluids-py-smartsim_regression_framework"
 # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
diff --git a/Makefile b/Makefile
index 4c10f10250..e08d834282 100644
--- a/Makefile
+++ b/Makefile
@@ -100,6 +100,9 @@ endif
 # SmartSim testing
 SMARTREDIS_DIR ?=
 
+# PyTorch testing
+USE_TORCH ?=
+
 # Warning: SANTIZ options still don't run with /gpu/occa
 AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
 
@@ -652,7 +655,7 @@ NPROC_POOL ?= 1
 export NPROC_POOL
 
 run-% : $(OBJDIR)/%
-	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) $(<:$(OBJDIR)/%=%)
+	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) ) $(if $(USE_TORCH),--has_torch $(USE_TORCH) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) $(<:$(OBJDIR)/%=%)
 
 external_examples := \
 	$(if $(MFEM_DIR),$(mfemexamples)) \
@@ -686,7 +689,7 @@ ctc-% : $(ctests);@$(foreach tst,$(ctests),$(tst) /cpu/$*;)
 
 prove : $(matched)
 	$(info Testing backends: $(BACKENDS))
-	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
+	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) ) $(if $(USE_TORCH),--has_torch $(USE_TORCH) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
 # Run prove target in parallel
 prv : ;@$(MAKE) $(MFLAGS) V=$(V) prove
 
@@ -694,7 +697,7 @@ prove-all :
 	+$(MAKE) prove realsearch=%
 
 junit-% : $(OBJDIR)/%
-	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
+	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) ) $(if $(USE_TORCH),--has_torch $(USE_TORCH) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
 
 junit : $(matched:$(OBJDIR)/%=junit-%)
 
@@ -852,7 +855,7 @@ print-% :
 CONFIG_VARS = CC CXX FC NVCC NVCC_CXX HIPCC \
   OPT CFLAGS CPPFLAGS CXXFLAGS FFLAGS NVCCFLAGS HIPCCFLAGS SYCLFLAGS \
   AR ARFLAGS LDFLAGS LDLIBS LIBCXX SED \
-  MAGMA_DIR OCCA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR SMARTREDIS_DIR
+  MAGMA_DIR OCCA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR SMARTREDIS_DIR USE_TORCH
 
 # $(call needs_save,CFLAGS) returns true (a nonempty string) if CFLAGS
 # was set on the command line or in config.mk (where it will appear as
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 249a4862d6..cc619ea82c 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -18,8 +18,8 @@
 //     ./navierstokes -ceed /cpu/self -options_file gaussianwave.yml
 //     ./navierstokes -ceed /gpu/cuda -problem advection -degree 1
 //
-//TESTARGS(name="Blasius, SGS DataDriven Sequential Torch") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_torch -sgs_model_dd_torch_model_path ./examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
-//TESTARGS(name="Blasius, SGS DataDriven Sequential Ceed") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_internal
+//TESTARGS(name="Blasius, SGS DataDriven Sequential Torch",only="torch") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_torch -sgs_model_dd_torch_model_path ./examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
+//TESTARGS(name="Blasius, SGS DataDriven Sequential Ceed") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_ceed
 //TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
 //TESTARGS(name="Advection 2D, rotation, explicit, supg, consistent mass") -ceed {ceed_resource} -test_type solver -problem advection -degree 3 -dm_plex_box_faces 2,2 -dm_plex_box_lower 0,0 -dm_plex_box_upper 125,125 -bc_wall 1,2,3,4 -wall_comps 4 -units_kilogram 1e-9 -rc 100. -ts_dt 1e-3 -ts_max_steps 10 -stab supg -Ctaus 0.5 -mass_ksp_type gmres -mass_pc_type vpbjacobi -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv2d-rotation-explicit-stab-supg-consistent-mass.bin
 //TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew  -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 7e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin
diff --git a/examples/fluids/pytorch_pkgconfig.py b/examples/fluids/pytorch_pkgconfig.py
index de3ed0991b..8cfdb96c32 100644
--- a/examples/fluids/pytorch_pkgconfig.py
+++ b/examples/fluids/pytorch_pkgconfig.py
@@ -5,6 +5,8 @@
 import re
 
 build_dir = Path('./build')
+if not build_dir.is_dir():
+    build_dir.mkdir()
 pkgconfig_path = build_dir / 'libtorch.pc'
 
 variables = {}
diff --git a/tests/junit.py b/tests/junit.py
index 041582c172..27d18220ea 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -28,6 +28,7 @@ def create_argparser() -> argparse.ArgumentParser:
     parser.add_argument('-b', '--junit-batch', type=str, default='', help='Name of JUnit batch for output file')
     parser.add_argument('-np', '--pool-size', type=int, default=1, help='Number of test cases to run in parallel')
     parser.add_argument('-s', '--smartredis_dir', type=str, default='', help='path to SmartSim library, if present')
+    parser.add_argument('--has_torch', type=bool, default=False, help='Whether to build with torch')
     parser.add_argument('test', help='Test executable', nargs='?')
 
     return parser
@@ -35,6 +36,9 @@ def create_argparser() -> argparse.ArgumentParser:
 
 # Necessary functions for running tests
 class CeedSuiteSpec(SuiteSpec):
+    def __init__(self, has_torch: bool):
+        self.has_torch: bool = has_torch
+
     def get_source_path(self, test: str) -> Path:
         """Compute path to test source file
 
@@ -110,6 +114,8 @@ def check_pre_skip(self, test: str, spec: TestSpec, resource: str, nproc: int) -
         for condition in spec.only:
             if (condition == 'cpu') and ('gpu' in resource):
                 return 'CPU only test with GPU backend'
+            if condition == 'torch' and not self.has_torch:
+                return 'PyTorch only test without USE_TORCH=1'
 
     def check_post_skip(self, test: str, spec: TestSpec, resource: str, stderr: str) -> Optional[str]:
         """Check if a test case should be allowed to fail, based on its stderr output
@@ -228,7 +234,7 @@ def check_allowed_stdout(self, test: str) -> bool:
             args.ceed_backends,
             args.mode,
             args.nproc,
-            CeedSuiteSpec(),
+            CeedSuiteSpec(args.has_torch),
             args.pool_size)
 
     # write output and check for failures

From cee3daa9f9aeb65d7ef0670c807c7894c670da7f Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 7 Jun 2024 10:47:53 -0600
Subject: [PATCH 070/571] build: Update fluids Makefile rules

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index e08d834282..85c5a9d481 100644
--- a/Makefile
+++ b/Makefile
@@ -626,7 +626,7 @@ $(OBJDIR)/petsc-% : examples/petsc/%.c examples/petsc/libutils.a.PHONY $(libceed
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/petsc/$* $@
 
-$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/problems/*.c examples/fluids/problems/torch/*.cpp examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) examples/fluids/Makefile | $$(@D)/.DIR
+$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/include/*.h examples/fluids/problems/*.c examples/fluids/problems/torch/*.cpp examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) examples/fluids/Makefile | $$(@D)/.DIR
 	+$(call quiet,MAKE) -C examples/fluids CEED_DIR=`pwd` \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/fluids/$* $@

From a8748852738d115cb225422dc30b7f1a40541446 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 25 Apr 2024 09:14:59 -0600
Subject: [PATCH 071/571] fluids: Create BCDefinition, use for essential BCs

---
 examples/fluids/include/bc_definition.h  |  45 ++++++++++
 examples/fluids/navierstokes.c           |   5 ++
 examples/fluids/navierstokes.h           |  31 +++----
 examples/fluids/src/bc_definition.c      | 106 +++++++++++++++++++++++
 examples/fluids/src/boundary_condition.c | 100 +++++++++++++++++++++
 examples/fluids/src/cloptions.c          |  50 -----------
 examples/fluids/src/setupdm.c            |  31 +++----
 7 files changed, 285 insertions(+), 83 deletions(-)
 create mode 100644 examples/fluids/include/bc_definition.h
 create mode 100644 examples/fluids/src/bc_definition.c
 create mode 100644 examples/fluids/src/boundary_condition.c

diff --git a/examples/fluids/include/bc_definition.h b/examples/fluids/include/bc_definition.h
new file mode 100644
index 0000000000..53fff9f23f
--- /dev/null
+++ b/examples/fluids/include/bc_definition.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed.h>
+#include <petsc.h>
+
+typedef struct _p_BCDefinition *BCDefinition;
+struct _p_BCDefinition {
+  char *name;
+
+  // Boundary ID information
+  PetscInt num_label_values, *label_values, dm_field;
+
+  // Essential Boundary information
+  PetscInt num_essential_comps, *essential_comps;
+};
+
+/**
+   @brief Creates a `BCDefinition` from an array of integers in an option in the database
+
+   Must be between `PetscOptionsBegin()` and `PetscOptionsEnd()`.
+
+   @param[in]  opt    The option one is seeking
+   @param[in]  text   Short string describing option
+   @param[in]  man    Manual page for the option
+   @param[in]  name   String that sets the name of the `BCDefinition`
+   @param[out] bc_def Resulting `BCDefinition`, `NULL` if option is not set
+   @param[out] set    `PETSC_TRUE` if found, else `PETSC_FALSE`
+**/
+#define PetscOptionsBCDefinition(opt, text, man, name, bc_def, set) \
+  PetscOptionsBCDefinition_Private(PetscOptionsObject, opt, text, man, name, bc_def, set)
+PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems *PetscOptionsObject, const char opt[], const char text[], const char man[],
+                                                const char name[], BCDefinition *bc_def, PetscBool *set);
+
+PetscErrorCode BCDefinitionCreate(const char *name, PetscInt num_label_values, PetscInt label_values[], BCDefinition *bc_def);
+PetscErrorCode BCDefinitionGetInfo(BCDefinition bc_def, const char *name[], PetscInt *num_label_values, const PetscInt *label_values[]);
+PetscErrorCode BCDefinitionDestroy(BCDefinition *bc_def);
+
+PetscErrorCode BCDefinitionSetEssential(BCDefinition bc_def, PetscInt num_essential_comps, PetscInt essential_comps[]);
+PetscErrorCode BCDefinitionGetEssential(BCDefinition bc_def, PetscInt *num_essential_comps, const PetscInt *essential_comps[]);
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index cc619ea82c..2377aa4b98 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -107,6 +107,7 @@ int main(int argc, char **argv) {
   MPI_Comm comm = PETSC_COMM_WORLD;
   user->comm    = comm;
   PetscCall(ProcessCommandLineOptions(comm, app_ctx, bc));
+  PetscCall(BoundaryConditionSetUp(user, problem, app_ctx, bc));
 
   // ---------------------------------------------------------------------------
   // Initialize libCEED
@@ -334,6 +335,10 @@ int main(int argc, char **argv) {
   PetscCall(PetscViewerDestroy(&app_ctx->wall_forces.viewer));
 
   // -- Structs
+  for (PetscInt i = 0; i < problem->num_bc_defs; i++) {
+    PetscCall(BCDefinitionDestroy(&problem->bc_defs[i]));
+  }
+  PetscCall(PetscFree(problem->bc_defs));
   PetscCall(PetscFree(units));
   PetscCall(PetscFree(user));
   PetscCall(PetscFree(problem));
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index de1fd300f0..8c069892d1 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <ceed.h>
+#include <bc_definition.h>
 #include <log_events.h>
 #include <mat-ceed.h>
 #include <petsc-ceed-utils.h>
@@ -90,12 +91,13 @@ static const char *const DifferentialFilterDampingFunctions[] = {
 // Structs
 // -----------------------------------------------------------------------------
 // Structs declarations
-typedef struct AppCtx_private   *AppCtx;
-typedef struct CeedData_private *CeedData;
-typedef struct User_private     *User;
-typedef struct Units_private    *Units;
-typedef struct SimpleBC_private *SimpleBC;
-typedef struct Physics_private  *Physics;
+typedef struct AppCtx_private      *AppCtx;
+typedef struct CeedData_private    *CeedData;
+typedef struct User_private        *User;
+typedef struct Units_private       *Units;
+typedef struct SimpleBC_private    *SimpleBC;
+typedef struct Physics_private     *Physics;
+typedef struct ProblemData_private *ProblemData;
 
 // Application context from user command line options
 struct AppCtx_private {
@@ -255,12 +257,8 @@ struct Units_private {
 
 // Boundary conditions
 struct SimpleBC_private {
-  PetscInt num_wall,  // Number of faces with wall BCs
-      wall_comps[5],  // An array of constrained component numbers
-      num_comps,
-      num_symmetry[3],  // Number of faces with symmetry BCs
-      num_inflow, num_outflow, num_freestream, num_slip;
-  PetscInt walls[16], symmetries[3][16], inflows[16], outflows[16], freestreams[16], slips[16];
+  PetscInt num_inflow, num_outflow, num_freestream, num_slip;
+  PetscInt inflows[16], outflows[16], freestreams[16], slips[16];
 };
 
 // Struct that contains all enums and structs used for the physics of all problems
@@ -273,6 +271,8 @@ struct Physics_private {
   CeedContextFieldLabel ics_time_label;
 };
 
+PetscErrorCode BoundaryConditionSetUp(User user, ProblemData problem, AppCtx app_ctx, SimpleBC bc);
+
 typedef struct {
   CeedQFunctionUser    qfunction;
   const char          *qfunction_loc;
@@ -280,14 +280,15 @@ typedef struct {
 } ProblemQFunctionSpec;
 
 // Problem specific data
-typedef struct ProblemData_private *ProblemData;
 struct ProblemData_private {
   CeedInt              dim, q_data_size_vol, q_data_size_sur, jac_data_size_sur;
   CeedScalar           dm_scale;
   ProblemQFunctionSpec ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow, apply_freestream, apply_slip,
       apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian;
-  bool      non_zero_time;
-  PetscBool bc_from_ics, use_strong_bc_ceed, uses_newtonian;
+  bool          non_zero_time;
+  PetscBool     bc_from_ics, use_strong_bc_ceed, uses_newtonian;
+  size_t        num_bc_defs;
+  BCDefinition *bc_defs;
   PetscErrorCode (*print_info)(User, ProblemData, AppCtx);
   PetscErrorCode (*create_mass_operator)(User, CeedOperator *);
 };
diff --git a/examples/fluids/src/bc_definition.c b/examples/fluids/src/bc_definition.c
new file mode 100644
index 0000000000..5d9b467057
--- /dev/null
+++ b/examples/fluids/src/bc_definition.c
@@ -0,0 +1,106 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <bc_definition.h>
+
+/**
+   @brief Create `BCDefinition`
+
+   @param[in]  name             Name of the boundary condition
+   @param[in]  num_label_values Number of `DMLabel` values
+   @param[in]  label_values     Array of label values that define the boundaries controlled by the `BCDefinition`, size `num_label_values`
+   @param[out] bc_def           The new `BCDefinition`
+**/
+PetscErrorCode BCDefinitionCreate(const char *name, PetscInt num_label_values, PetscInt label_values[], BCDefinition *bc_def) {
+  PetscFunctionBeginUser;
+  PetscCall(PetscNew(bc_def));
+
+  PetscCall(PetscStrallocpy(name, &(*bc_def)->name));
+  (*bc_def)->num_label_values = num_label_values;
+  PetscCall(PetscMalloc1(num_label_values, &(*bc_def)->label_values));
+  for (PetscInt i = 0; i < num_label_values; i++) (*bc_def)->label_values[i] = label_values[i];
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Get base information for `BCDefinition`
+
+   @param[in]  bc_def           `BCDefinition` to get information from
+   @param[out] name             Name of the `BCDefinition`
+   @param[out] num_label_values Number of `DMLabel` values
+   @param[out] label_values     Array of label values that define the boundaries controlled by the `BCDefinition`, size `num_label_values`
+**/
+PetscErrorCode BCDefinitionGetInfo(BCDefinition bc_def, const char *name[], PetscInt *num_label_values, const PetscInt *label_values[]) {
+  PetscFunctionBeginUser;
+  if (name) *name = bc_def->name;
+  if (label_values) {
+    *num_label_values = bc_def->num_label_values;
+    *label_values     = bc_def->label_values;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Destory a `BCDefinition` object
+
+   @param[in,out] bc_def `BCDefinition` to be destroyed
+**/
+PetscErrorCode BCDefinitionDestroy(BCDefinition *bc_def) {
+  PetscFunctionBeginUser;
+  if ((*bc_def)->name) PetscCall(PetscFree((*bc_def)->name));
+  if ((*bc_def)->label_values) PetscCall(PetscFree((*bc_def)->label_values));
+  if ((*bc_def)->essential_comps) PetscCall(PetscFree((*bc_def)->essential_comps));
+  PetscCall(PetscFree(*bc_def));
+  *bc_def = NULL;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Set `DM_BC_ESSENTIAL` boundary condition values
+
+   @param[in,out] bc_def              `BCDefinition` to set values to
+   @param[in]     num_essential_comps Number of components to set
+   @param[in]     essential_comps     Array of components to set, size `num_essential_comps`
+**/
+PetscErrorCode BCDefinitionSetEssential(BCDefinition bc_def, PetscInt num_essential_comps, PetscInt essential_comps[]) {
+  PetscFunctionBeginUser;
+  bc_def->num_essential_comps = num_essential_comps;
+  PetscCall(PetscMalloc1(num_essential_comps, &bc_def->essential_comps));
+  PetscCall(PetscArraycpy(bc_def->essential_comps, essential_comps, num_essential_comps));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Get `DM_BC_ESSENTIAL` boundary condition values
+
+   @param[in]  bc_def              `BCDefinition` to set values to
+   @param[out] num_essential_comps Number of components to set
+   @param[out] essential_comps     Array of components to set, size `num_essential_comps`
+**/
+PetscErrorCode BCDefinitionGetEssential(BCDefinition bc_def, PetscInt *num_essential_comps, const PetscInt *essential_comps[]) {
+  PetscFunctionBeginUser;
+  *num_essential_comps = bc_def->num_essential_comps;
+  *essential_comps     = bc_def->essential_comps;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+#define LABEL_ARRAY_SIZE 256
+
+// @brief See `PetscOptionsBCDefinition`
+PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems *PetscOptionsObject, const char opt[], const char text[], const char man[],
+                                                const char name[], BCDefinition *bc_def, PetscBool *set) {
+  PetscInt num_label_values = LABEL_ARRAY_SIZE, label_values[LABEL_ARRAY_SIZE] = {0};
+
+  PetscFunctionBeginUser;
+  PetscCall(PetscOptionsIntArray(opt, text, man, label_values, &num_label_values, set));
+  if (num_label_values > 0) {
+    PetscCall(BCDefinitionCreate(name, num_label_values, label_values, bc_def));
+  } else {
+    *bc_def = NULL;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/boundary_condition.c b/examples/fluids/src/boundary_condition.c
new file mode 100644
index 0000000000..18882f9839
--- /dev/null
+++ b/examples/fluids/src/boundary_condition.c
@@ -0,0 +1,100 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "../navierstokes.h"
+
+/**
+   @brief Add `BCDefinition` to a `PetscSegBuffer`
+
+   @param[in]     bc_def      `BCDefinition` to add
+   @param[in,out] bc_defs_seg `PetscSegBuffer` to add to
+**/
+static PetscErrorCode AddBCDefinitionToSegBuffer(BCDefinition bc_def, PetscSegBuffer bc_defs_seg) {
+  BCDefinition *bc_def_ptr;
+
+  PetscFunctionBeginUser;
+  if (bc_def == NULL) PetscFunctionReturn(PETSC_SUCCESS);
+  PetscCall(PetscSegBufferGet(bc_defs_seg, 1, &bc_def_ptr));
+  *bc_def_ptr = bc_def;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Create and setup `BCDefinition`s and `SimpleBC` from commandline options
+
+   @param[in]     user    `User`
+   @param[in,out] problem `ProblemData`
+   @param[in]     app_ctx `AppCtx`
+   @param[in,out] bc      `SimpleBC`
+**/
+PetscErrorCode BoundaryConditionSetUp(User user, ProblemData problem, AppCtx app_ctx, SimpleBC bc) {
+  PetscSegBuffer bc_defs_seg;
+  PetscBool      flg;
+  BCDefinition   bc_def;
+
+  PetscFunctionBeginUser;
+  PetscCall(PetscSegBufferCreate(sizeof(BCDefinition), 4, &bc_defs_seg));
+
+  PetscOptionsBegin(user->comm, NULL, "Boundary Condition Options", NULL);
+
+  PetscCall(PetscOptionsBCDefinition("-bc_wall", "Face IDs to apply wall BC", NULL, "wall", &bc_def, NULL));
+  PetscCall(AddBCDefinitionToSegBuffer(bc_def, bc_defs_seg));
+  if (bc_def) {
+    PetscInt num_essential_comps = 16, essential_comps[16];
+
+    PetscCall(PetscOptionsIntArray("-wall_comps", "An array of constrained component numbers", NULL, essential_comps, &num_essential_comps, &flg));
+    PetscCall(BCDefinitionSetEssential(bc_def, num_essential_comps, essential_comps));
+
+    app_ctx->wall_forces.num_wall = bc_def->num_label_values;
+    PetscCall(PetscMalloc1(bc_def->num_label_values, &app_ctx->wall_forces.walls));
+    PetscCall(PetscArraycpy(app_ctx->wall_forces.walls, bc_def->label_values, bc_def->num_label_values));
+  }
+
+  {  // Symmetry Boundary Conditions
+    const char *deprecated[3] = {"-bc_slip_x", "-bc_slip_y", "-bc_slip_z"};
+    const char *flags[3]      = {"-bc_symmetry_x", "-bc_symmetry_y", "-bc_symmetry_z"};
+
+    for (PetscInt j = 0; j < 3; j++) {
+      PetscCall(PetscOptionsDeprecated(deprecated[j], flags[j], "libCEED 0.12.0",
+                                       "Use -bc_symmetry_[x,y,z] for direct equivalency, or -bc_slip for weak, Riemann-based, direction-invariant "
+                                       "slip/no-penatration boundary conditions"));
+      PetscCall(PetscOptionsBCDefinition(flags[j], "Face IDs to apply symmetry BC", NULL, "symmetry", &bc_def, NULL));
+      if (!bc_def) {
+        PetscCall(PetscOptionsBCDefinition(deprecated[j], "Face IDs to apply symmetry BC", NULL, "symmetry", &bc_def, NULL));
+      }
+      PetscCall(AddBCDefinitionToSegBuffer(bc_def, bc_defs_seg));
+      if (bc_def) {
+        PetscInt essential_comps[1] = {j + 1};
+
+        PetscCall(BCDefinitionSetEssential(bc_def, 1, essential_comps));
+      }
+    }
+  }
+
+  // Inflow BCs
+  bc->num_inflow = 16;
+  PetscCall(PetscOptionsIntArray("-bc_inflow", "Face IDs to apply inflow BC", NULL, bc->inflows, &bc->num_inflow, NULL));
+  // Outflow BCs
+  bc->num_outflow = 16;
+  PetscCall(PetscOptionsIntArray("-bc_outflow", "Face IDs to apply outflow BC", NULL, bc->outflows, &bc->num_outflow, NULL));
+  // Freestream BCs
+  bc->num_freestream = 16;
+  PetscCall(PetscOptionsIntArray("-bc_freestream", "Face IDs to apply freestream BC", NULL, bc->freestreams, &bc->num_freestream, NULL));
+
+  bc->num_slip = 16;
+  PetscCall(PetscOptionsIntArray("-bc_slip", "Face IDs to apply slip BC", NULL, bc->slips, &bc->num_slip, NULL));
+
+  PetscOptionsEnd();
+
+  PetscCall(PetscSegBufferGetSize(bc_defs_seg, &problem->num_bc_defs));
+  PetscCall(PetscSegBufferExtractAlloc(bc_defs_seg, &problem->bc_defs));
+  PetscCall(PetscSegBufferDestroy(&bc_defs_seg));
+
+  //TODO: Verify that the BCDefinition don't have overlapping claims to boundary faces
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c
index abf3ea38b0..265092badd 100644
--- a/examples/fluids/src/cloptions.c
+++ b/examples/fluids/src/cloptions.c
@@ -134,56 +134,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC
     strncpy(app_ctx->problem_name, problem_name, 16);
   }
 
-  // Wall Boundary Conditions
-  bc->num_wall = 16;
-  PetscBool flg;
-  PetscCall(PetscOptionsIntArray("-bc_wall", "Face IDs to apply wall BC", NULL, bc->walls, &bc->num_wall, NULL));
-  bc->num_comps = 5;
-  PetscCall(PetscOptionsIntArray("-wall_comps", "An array of constrained component numbers", NULL, bc->wall_comps, &bc->num_comps, &flg));
-
-  {  // Symmetry Boundary Conditions
-    const char *deprecated[3] = {"-bc_slip_x", "-bc_slip_y", "-bc_slip_z"};
-    const char *flags[3]      = {"-bc_symmetry_x", "-bc_symmetry_y", "-bc_symmetry_z"};
-    PetscBool   flg, has_symmetry = PETSC_FALSE;
-
-    for (PetscInt j = 0; j < 3; j++) {
-      bc->num_symmetry[j] = 16;
-      PetscCall(PetscOptionsDeprecated(deprecated[j], flags[j], "libCEED 0.12.0",
-                                       "Use -bc_symmetry_[x,y,z] for direct equivalency, or -bc_slip for weak, Riemann-based, direction-invariant "
-                                       "slip/no-penatration boundary conditions"));
-      PetscCall(PetscOptionsIntArray(flags[j], "Face IDs to apply symmetry BC", NULL, bc->symmetries[j], &bc->num_symmetry[j], &flg));
-      if (bc->num_symmetry[j] > 0) has_symmetry = PETSC_TRUE;
-    }
-
-    // Error if wall and symmetry BCs are set on the same face
-    if (has_symmetry) {
-      for (PetscInt c = 0; c < 3; c++) {
-        for (PetscInt s = 0; s < bc->num_symmetry[c]; s++) {
-          for (PetscInt w = 0; w < bc->num_wall; w++) {
-            PetscCheck(bc->symmetries[c][s] != bc->walls[w], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG,
-                       "Boundary condition already set on face %" PetscInt_FMT "!\n", bc->walls[w]);
-          }
-        }
-      }
-    }
-  }
-  app_ctx->wall_forces.num_wall = bc->num_wall;
-  PetscCall(PetscMalloc1(bc->num_wall, &app_ctx->wall_forces.walls));
-  PetscCall(PetscArraycpy(app_ctx->wall_forces.walls, bc->walls, bc->num_wall));
-
-  // Inflow BCs
-  bc->num_inflow = 16;
-  PetscCall(PetscOptionsIntArray("-bc_inflow", "Face IDs to apply inflow BC", NULL, bc->inflows, &bc->num_inflow, NULL));
-  // Outflow BCs
-  bc->num_outflow = 16;
-  PetscCall(PetscOptionsIntArray("-bc_outflow", "Face IDs to apply outflow BC", NULL, bc->outflows, &bc->num_outflow, NULL));
-  // Freestream BCs
-  bc->num_freestream = 16;
-  PetscCall(PetscOptionsIntArray("-bc_freestream", "Face IDs to apply freestream BC", NULL, bc->freestreams, &bc->num_freestream, NULL));
-
-  bc->num_slip = 16;
-  PetscCall(PetscOptionsIntArray("-bc_slip", "Face IDs to apply slip BC", NULL, bc->slips, &bc->num_slip, NULL));
-
   // Statistics Options
   app_ctx->turb_spanstats_collect_interval = 1;
   PetscCall(PetscOptionsInt("-ts_monitor_turbulence_spanstats_collect_interval", "Number of timesteps between statistics collection", NULL,
diff --git a/examples/fluids/src/setupdm.c b/examples/fluids/src/setupdm.c
index a0df7dfe69..44cd510b6d 100644
--- a/examples/fluids/src/setupdm.c
+++ b/examples/fluids/src/setupdm.c
@@ -52,24 +52,19 @@ PetscErrorCode SetUpDM(DM dm, ProblemData problem, PetscInt degree, PetscInt q_e
     DMLabel label;
     PetscCall(DMGetLabel(dm, "Face Sets", &label));
     PetscCall(DMPlexLabelComplete(dm, label));
-    // Set wall BCs
-    if (bc->num_wall > 0) {
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "wall", label, bc->num_wall, bc->walls, 0, bc->num_comps, bc->wall_comps, NULL, NULL, NULL, NULL));
-    }
-    // Set symmetry BCs in the x direction
-    if (bc->num_symmetry[0] > 0) {
-      PetscInt comps[1] = {1};
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_x", label, bc->num_symmetry[0], bc->symmetries[0], 0, 1, comps, NULL, NULL, NULL, NULL));
-    }
-    // Set symmetry BCs in the y direction
-    if (bc->num_symmetry[1] > 0) {
-      PetscInt comps[1] = {2};
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_y", label, bc->num_symmetry[1], bc->symmetries[1], 0, 1, comps, NULL, NULL, NULL, NULL));
-    }
-    // Set symmetry BCs in the z direction
-    if (bc->num_symmetry[2] > 0) {
-      PetscInt comps[1] = {3};
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_z", label, bc->num_symmetry[2], bc->symmetries[2], 0, 1, comps, NULL, NULL, NULL, NULL));
+
+    for (PetscInt i = 0; i < problem->num_bc_defs; i++) {
+      BCDefinition    bc_def = problem->bc_defs[i];
+      PetscInt        num_essential_comps, num_label_values;
+      const PetscInt *essential_comps, *label_values;
+      const char     *name;
+
+      PetscCall(BCDefinitionGetEssential(bc_def, &num_essential_comps, &essential_comps));
+      if (essential_comps > 0) {
+        PetscCall(BCDefinitionGetInfo(bc_def, &name, &num_label_values, &label_values));
+        PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, name, label, num_label_values, label_values, 0, num_essential_comps, essential_comps, NULL, NULL,
+                                NULL, NULL));
+      }
     }
     {
       PetscBool use_strongstg = PETSC_FALSE;

From de327db462e8afadb04b2fd99bdb39cfb29a1d98 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 17 Jun 2024 11:59:07 -0600
Subject: [PATCH 072/571] refactor(fluids): Rename some booleans

---
 examples/fluids/navierstokes.c         | 10 +++++-----
 examples/fluids/navierstokes.h         |  4 ++--
 examples/fluids/problems/advection.c   |  4 ++--
 examples/fluids/problems/eulervortex.c |  2 +-
 examples/fluids/problems/newtonian.c   | 10 +++++-----
 examples/fluids/problems/shocktube.c   |  2 +-
 examples/fluids/problems/stg_shur14.c  |  4 ++--
 examples/fluids/src/misc.c             |  2 +-
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 2377aa4b98..0a91069b58 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -90,10 +90,10 @@ int main(int argc, char **argv) {
   Units units;
   PetscCall(PetscCalloc1(1, &units));
 
-  user->app_ctx        = app_ctx;
-  user->units          = units;
-  user->phys           = phys_ctx;
-  problem->bc_from_ics = PETSC_TRUE;
+  user->app_ctx            = app_ctx;
+  user->units              = units;
+  user->phys               = phys_ctx;
+  problem->set_bc_from_ics = PETSC_TRUE;
 
   PetscCall(RegisterLogEvents());
 
@@ -210,7 +210,7 @@ int main(int argc, char **argv) {
   //    We use this for the main simulation DM because the reference DMPlexInsertBoundaryValues() is very slow on the GPU due to extra device-to-host
   //    communication. If we disable this, we should still get the same results due to the problem->bc function, but with potentially much slower
   //    execution.
-  if (problem->bc_from_ics) {
+  if (problem->set_bc_from_ics) {
     PetscCall(SetBCsFromICs(dm, Q, user->Q_loc));
   }
 
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 8c069892d1..02f2bd87f1 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -285,8 +285,8 @@ struct ProblemData_private {
   CeedScalar           dm_scale;
   ProblemQFunctionSpec ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow, apply_freestream, apply_slip,
       apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian;
-  bool          non_zero_time;
-  PetscBool     bc_from_ics, use_strong_bc_ceed, uses_newtonian;
+  bool          compute_exact_solution_error;
+  PetscBool     set_bc_from_ics, use_strong_bc_ceed, uses_newtonian;
   size_t        num_bc_defs;
   BCDefinition *bc_defs;
   PetscErrorCode (*print_info)(User, ProblemData, AppCtx);
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 4235692863..3c8423a9d7 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -112,7 +112,7 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
       problem->apply_vol_ifunction.qfunction_loc = IFunction_Advection2d_loc;
       problem->apply_inflow.qfunction            = Advection2d_InOutFlow;
       problem->apply_inflow.qfunction_loc        = Advection2d_InOutFlow_loc;
-      problem->non_zero_time                     = PETSC_TRUE;
+      problem->compute_exact_solution_error      = PETSC_TRUE;
       problem->print_info                        = PRINT_ADVECTION;
       break;
     case 3:
@@ -125,7 +125,7 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
       problem->apply_vol_ifunction.qfunction_loc = IFunction_Advection_loc;
       problem->apply_inflow.qfunction            = Advection_InOutFlow;
       problem->apply_inflow.qfunction_loc        = Advection_InOutFlow_loc;
-      problem->non_zero_time                     = PETSC_FALSE;
+      problem->compute_exact_solution_error      = PETSC_FALSE;
       problem->print_info                        = PRINT_ADVECTION;
       break;
   }
diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c
index a823a71433..8397561102 100644
--- a/examples/fluids/problems/eulervortex.c
+++ b/examples/fluids/problems/eulervortex.c
@@ -42,7 +42,7 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b
   problem->apply_inflow.qfunction_loc        = TravelingVortex_Inflow_loc;
   problem->apply_outflow.qfunction           = Euler_Outflow;
   problem->apply_outflow.qfunction_loc       = Euler_Outflow_loc;
-  problem->non_zero_time                     = PETSC_TRUE;
+  problem->compute_exact_solution_error      = PETSC_TRUE;
   problem->print_info                        = PRINT_EULER_VORTEX;
 
   // ------------------------------------------------------
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index efd1f37187..ed204602da 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -136,11 +136,11 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   // ------------------------------------------------------
   //           Setup Generic Newtonian IG Problem
   // ------------------------------------------------------
-  problem->dim               = 3;
-  problem->jac_data_size_sur = 11;
-  problem->non_zero_time     = PETSC_FALSE;
-  problem->print_info        = PRINT_NEWTONIAN;
-  problem->uses_newtonian    = PETSC_TRUE;
+  problem->dim                          = 3;
+  problem->jac_data_size_sur            = 11;
+  problem->compute_exact_solution_error = PETSC_FALSE;
+  problem->print_info                   = PRINT_NEWTONIAN;
+  problem->uses_newtonian               = PETSC_TRUE;
 
   // ------------------------------------------------------
   //             Create the libCEED context
diff --git a/examples/fluids/problems/shocktube.c b/examples/fluids/problems/shocktube.c
index 38758516f5..3e637392fa 100644
--- a/examples/fluids/problems/shocktube.c
+++ b/examples/fluids/problems/shocktube.c
@@ -40,7 +40,7 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
   problem->apply_vol_rhs.qfunction_loc       = EulerShockTube_loc;
   problem->apply_vol_ifunction.qfunction     = NULL;
   problem->apply_vol_ifunction.qfunction_loc = NULL;
-  problem->non_zero_time                     = PETSC_FALSE;
+  problem->compute_exact_solution_error      = PETSC_FALSE;
   problem->print_info                        = PRINT_SHOCKTUBE;
 
   // ------------------------------------------------------
diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c
index 5d9d4f1dc4..b748300bfa 100644
--- a/examples/fluids/problems/stg_shur14.c
+++ b/examples/fluids/problems/stg_shur14.c
@@ -278,7 +278,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   if (use_stgstrong) {
     // Use default boundary integral QF (BoundaryIntegral) in newtonian.h
     problem->use_strong_bc_ceed = PETSC_TRUE;
-    problem->bc_from_ics        = PETSC_FALSE;
+    problem->set_bc_from_ics    = PETSC_FALSE;
   } else {
     problem->apply_inflow.qfunction              = StgShur14Inflow;
     problem->apply_inflow.qfunction_loc          = StgShur14Inflow_loc;
@@ -286,7 +286,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
     problem->apply_inflow_jacobian.qfunction_loc = StgShur14Inflow_Jacobian_loc;
     PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow_jacobian.qfunction_context));
-    problem->bc_from_ics = PETSC_TRUE;
+    problem->set_bc_from_ics = PETSC_TRUE;
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 60c078b0d5..f60f931555 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -201,7 +201,7 @@ PetscErrorCode PostProcess(TS ts, CeedData ceed_data, DM dm, ProblemData problem
 
   PetscFunctionBeginUser;
   // Print relative error
-  if (problem->non_zero_time && user->app_ctx->test_type == TESTTYPE_NONE) {
+  if (problem->compute_exact_solution_error && user->app_ctx->test_type == TESTTYPE_NONE) {
     PetscCall(PrintError(ceed_data, dm, user, Q, final_time));
   }
 

From 02b29df7b0ab76fdb65016753e3b39a54ba3f090 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 14 Jun 2024 13:41:14 -0600
Subject: [PATCH 073/571] fluids: Add Entropy state variable and tests

Co-authored-by: Zach Atkins <zacharyjayhawk@gmail.com>
---
 examples/fluids/problems/newtonian.c         | 155 ++++++++++++----
 examples/fluids/qfunctions/newtonian_state.h | 176 +++++++++++++++++++
 examples/fluids/qfunctions/newtonian_types.h |   7 +
 3 files changed, 307 insertions(+), 31 deletions(-)

diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index efd1f37187..39e8c469d9 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -16,47 +16,138 @@
 #include "../navierstokes.h"
 
 // For use with PetscOptionsEnum
-static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "StateVariable", "STATEVAR_", NULL};
+static const char *const StateVariables[] = {"conservative", "primitive", "entropy", "StateVariable", "STATEVAR_", NULL};
 
-// Compute relative error |a - b|/|s|
-static PetscErrorCode CheckPrimitiveWithTolerance(StatePrimitive sY, StatePrimitive aY, StatePrimitive bY, const char *name, PetscReal rtol_pressure,
-                                                  PetscReal rtol_velocity, PetscReal rtol_temperature) {
-  StatePrimitive eY;  // relative error
+static PetscErrorCode CheckQWithTolerance(const CeedScalar Q_s[5], const CeedScalar Q_a[5], const CeedScalar Q_b[5], const char *name,
+                                          PetscReal rtol_0, PetscReal rtol_u, PetscReal rtol_4) {
+  CeedScalar relative_error[5];  // relative error
+  CeedScalar divisor_threshold = 10 * CEED_EPSILON;
 
   PetscFunctionBeginUser;
-  eY.pressure   = (aY.pressure - bY.pressure) / sY.pressure;
-  PetscScalar u = sqrt(Square(sY.velocity[0]) + Square(sY.velocity[1]) + Square(sY.velocity[2]));
-  for (int j = 0; j < 3; j++) eY.velocity[j] = (aY.velocity[j] - bY.velocity[j]) / u;
-  eY.temperature = (aY.temperature - bY.temperature) / sY.temperature;
-  if (fabs(eY.pressure) > rtol_pressure) printf("%s: pressure error %g\n", name, eY.pressure);
-  for (int j = 0; j < 3; j++) {
-    if (fabs(eY.velocity[j]) > rtol_velocity) printf("%s: velocity[%d] error %g\n", name, j, eY.velocity[j]);
+  relative_error[0] = (Q_a[0] - Q_b[0]) / (fabs(Q_s[0]) > divisor_threshold ? Q_s[0] : 1);
+  relative_error[4] = (Q_a[4] - Q_b[4]) / (fabs(Q_s[4]) > divisor_threshold ? Q_s[4] : 1);
+
+  CeedScalar u_magnitude = sqrt(Square(Q_s[1]) + Square(Q_s[2]) + Square(Q_s[3]));
+  CeedScalar u_divisor   = u_magnitude > divisor_threshold ? u_magnitude : 1;
+  for (int i = 1; i < 4; i++) {
+    relative_error[i] = (Q_a[i] - Q_b[i]) / u_divisor;
+  }
+
+  if (fabs(relative_error[0]) >= rtol_0) {
+    printf("%s[0] error %g (expected %.10e, got %.10e)\n", name, relative_error[0], Q_s[0], Q_a[0]);
   }
-  if (fabs(eY.temperature) > rtol_temperature) printf("%s: temperature error %g\n", name, eY.temperature);
+  for (int i = 1; i < 4; i++) {
+    if (fabs(relative_error[i]) >= rtol_u) {
+      printf("%s[%d] error %g (expected %.10e, got %.10e)\n", name, i, relative_error[i], Q_s[i], Q_a[i]);
+    }
+  }
+  if (fabs(relative_error[4]) >= rtol_4) {
+    printf("%s[4] error %g (expected %.10e, got %.10e)\n", name, relative_error[4], Q_s[4], Q_a[4]);
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify `StateFromQ` by converting A0 -> B0 -> A0_test, where A0 should equal A0_test
+static PetscErrorCode TestState(StateVariable state_var_A, StateVariable state_var_B, NewtonianIdealGasContext gas, const CeedScalar A0[5],
+                                CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar        B0[5], A0_test[5];
+  char              buf[128];
+  const char *const StateVariables_Initial[] = {"U", "Y", "V"};
+
+  PetscFunctionBeginUser;
+  const char *A_initial = StateVariables_Initial[state_var_A];
+  const char *B_initial = StateVariables_Initial[state_var_B];
+
+  State state_A0 = StateFromQ(gas, A0, state_var_A);
+  StateToQ(gas, state_A0, B0, state_var_B);
+  State state_B0 = StateFromQ(gas, B0, state_var_B);
+  StateToQ(gas, state_B0, A0_test, state_var_A);
+
+  snprintf(buf, sizeof buf, "%s->%s->%s: %s", A_initial, B_initial, A_initial, A_initial);
+  PetscCall(CheckQWithTolerance(A0, A0_test, A0, buf, rtol_0, rtol_u, rtol_4));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+// @brief Verify `StateFromQ_fwd` via a finite difference approximation
+static PetscErrorCode TestState_fwd(StateVariable state_var_A, StateVariable state_var_B, NewtonianIdealGasContext gas, const CeedScalar A0[5],
+                                    CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar        eps = 4e-7;  // Finite difference step
+  char              buf[128];
+  const char *const StateVariables_Initial[] = {"U", "Y", "V"};
+
+  PetscFunctionBeginUser;
+  const char *A_initial = StateVariables_Initial[state_var_A];
+  const char *B_initial = StateVariables_Initial[state_var_B];
+  State       state_0   = StateFromQ(gas, A0, state_var_A);
+
+  for (int i = 0; i < 5; i++) {
+    CeedScalar dB[5] = {0.}, dB_fd[5] = {0.};
+    {  // Calculate dB using State functions
+      CeedScalar dA[5] = {0};
+
+      dA[i]    = A0[i];
+      State ds = StateFromQ_fwd(gas, state_0, dA, state_var_A);
+      StateToQ(gas, ds, dB, state_var_B);
+    }
+
+    {  // Calculate dB_fd via finite difference approximation
+      CeedScalar A1[5], B0[5], B1[5];
+
+      for (int j = 0; j < 5; j++) A1[j] = (1 + eps * (i == j)) * A0[j];
+      State state_1 = StateFromQ(gas, A1, state_var_A);
+      StateToQ(gas, state_0, B0, state_var_B);
+      StateToQ(gas, state_1, B1, state_var_B);
+      for (int j = 0; j < 5; j++) dB_fd[j] = (B1[j] - B0[j]) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "d%s->d%s: StateFrom%s_fwd i=%d: d%s", A_initial, B_initial, A_initial, i, B_initial);
+    PetscCall(CheckQWithTolerance(dB_fd, dB, dB_fd, buf, rtol_0, rtol_u, rtol_4));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Test the Newtonian State transformation functions, `StateFrom*`
 static PetscErrorCode UnitTests_Newtonian(User user, NewtonianIdealGasContext gas) {
   Units            units = user->units;
-  const CeedScalar eps   = 1e-6;
-  const CeedScalar kg = units->kilogram, m = units->meter, sec = units->second, Pascal = units->Pascal;
+  const CeedScalar kg = units->kilogram, m = units->meter, sec = units->second, K = units->Kelvin;
+
   PetscFunctionBeginUser;
-  const CeedScalar rho = 1.2 * kg / (m * m * m), u = 40 * m / sec;
-  CeedScalar       U[5] = {rho, rho * u, rho * u * 1.1, rho * u * 1.2, 250e3 * Pascal + .5 * rho * u * u};
-  State            s    = StateFromU(gas, U);
-  for (int i = 0; i < 8; i++) {
-    CeedScalar dU[5] = {0};
-    if (i < 5) dU[i] = U[i];
-    State ds = StateFromU_fwd(gas, s, dU);
-    for (int j = 0; j < 5; j++) dU[j] = (1 + eps * (i == j)) * U[j];
-    State          t = StateFromU(gas, dU);
-    StatePrimitive dY;
-    dY.pressure = (t.Y.pressure - s.Y.pressure) / eps;
-    for (int j = 0; j < 3; j++) dY.velocity[j] = (t.Y.velocity[j] - s.Y.velocity[j]) / eps;
-    dY.temperature = (t.Y.temperature - s.Y.temperature) / eps;
-    char buf[128];
-    snprintf(buf, sizeof buf, "StateFromU_fwd i=%d", i);
-    PetscCall(CheckPrimitiveWithTolerance(dY, ds.Y, dY, buf, 5e-6, 1e-6, 1e-6));
+  const CeedScalar T          = 200 * K;
+  const CeedScalar rho        = 1.2 * kg / Cube(m);
+  const CeedScalar P          = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base     = 40 * m / sec;
+  const CeedScalar u[3]       = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar e_kinetic  = 0.5 * Dot3(u, u);
+  const CeedScalar e_internal = gas->cv * T;
+  const CeedScalar e_total    = e_kinetic + e_internal;
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar entropy    = log(P) - gamma * log(rho);
+  const CeedScalar rho_div_p  = rho / P;
+  const CeedScalar Y0[5]      = {P, u[0], u[1], u[2], T};
+  const CeedScalar U0[5]      = {rho, rho * u[0], rho * u[1], rho * u[2], rho * e_total};
+  const CeedScalar V0[5]      = {(gamma - entropy) / (gamma - 1) - rho_div_p * (e_kinetic), rho_div_p * u[0], rho_div_p * u[1], rho_div_p * u[2],
+                                 -rho_div_p};
+
+  {
+    CeedScalar rtol = 20 * CEED_EPSILON;
+
+    PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_CONSERVATIVE, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_ENTROPY, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_CONSERVATIVE, STATEVAR_PRIMITIVE, gas, U0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_CONSERVATIVE, STATEVAR_ENTROPY, gas, U0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_ENTROPY, STATEVAR_CONSERVATIVE, gas, V0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_ENTROPY, STATEVAR_PRIMITIVE, gas, V0, rtol, rtol, rtol));
+  }
+
+  {
+    CeedScalar rtol = 5e-6;
+
+    PetscCall(TestState_fwd(STATEVAR_PRIMITIVE, STATEVAR_CONSERVATIVE, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_PRIMITIVE, STATEVAR_ENTROPY, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_CONSERVATIVE, STATEVAR_PRIMITIVE, gas, U0, rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_CONSERVATIVE, STATEVAR_ENTROPY, gas, U0, 10 * rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_ENTROPY, STATEVAR_CONSERVATIVE, gas, V0, 5 * rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_ENTROPY, STATEVAR_PRIMITIVE, gas, V0, 5 * rtol, 5 * rtol, 5 * rtol));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -212,6 +303,8 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
       problem->apply_inflow_jacobian.qfunction     = BoundaryIntegral_Jacobian_Prim;
       problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Prim_loc;
       break;
+    case STATEVAR_ENTROPY:
+      break;
   }
 
   // -- Physics
diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h
index 185caf06d6..eb9f12b3d1 100644
--- a/examples/fluids/qfunctions/newtonian_state.h
+++ b/examples/fluids/qfunctions/newtonian_state.h
@@ -24,6 +24,7 @@ typedef struct {
 typedef struct {
   StateConservative U;
   StatePrimitive    Y;
+  StateEntropy      V;
 } State;
 
 CEED_QFUNCTION_HELPER void UnpackState_U(StateConservative s, CeedScalar U[5]) {
@@ -38,6 +39,12 @@ CEED_QFUNCTION_HELPER void UnpackState_Y(StatePrimitive s, CeedScalar Y[5]) {
   Y[4] = s.temperature;
 }
 
+CEED_QFUNCTION_HELPER void UnpackState_V(StateEntropy s, CeedScalar V[5]) {
+  V[0] = s.S_density;
+  for (int i = 0; i < 3; i++) V[i + 1] = s.S_momentum[i];
+  V[4] = s.S_energy;
+}
+
 CEED_QFUNCTION_HELPER CeedScalar HeatCapacityRatio(NewtonianIdealGasContext gas) { return gas->cp / gas->cv; }
 
 CEED_QFUNCTION_HELPER CeedScalar GasConstant(NewtonianIdealGasContext gas) { return gas->cp - gas->cv; }
@@ -116,19 +123,149 @@ CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive_fwd(Newto
   return dU;
 }
 
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromConservative(NewtonianIdealGasContext gas, StateConservative U) {
+  StateEntropy     V;
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic  = .5 * Dot3(U.momentum, U.momentum) / U.density;
+  const CeedScalar e_internal = U.E_total - e_kinetic;
+  const CeedScalar p          = (gamma - 1) * e_internal;
+  const CeedScalar entropy    = log(p) - gamma * log(U.density);
+
+  V.S_density = (gamma - entropy) / (gamma - 1) - e_kinetic / p;
+  for (int i = 0; i < 3; i++) V.S_momentum[i] = U.momentum[i] / p;
+  V.S_energy = -U.density / p;
+  return V;
+}
+
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromConservative_fwd(NewtonianIdealGasContext gas, State s, StateConservative dU) {
+  StateEntropy     dV;
+  const CeedScalar gamma       = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic   = .5 * Dot3(s.U.momentum, s.U.momentum) / s.U.density;
+  const CeedScalar de_kinetic  = (Dot3(s.U.momentum, dU.momentum) - e_kinetic * dU.density) / s.U.density;
+  const CeedScalar de_internal = dU.E_total - de_kinetic;
+  const CeedScalar p           = s.Y.pressure;
+  const CeedScalar dp          = (gamma - 1) * de_internal;
+
+  CeedScalar dentropy = dp / p - gamma * dU.density / s.U.density;
+
+  dV.S_density = -dentropy / (gamma - 1) - de_kinetic / p + dp * e_kinetic / Square(p);
+  for (CeedInt i = 0; i < 3; i++) {
+    dV.S_momentum[i] = (dU.momentum[i] - s.U.momentum[i] * dp / p) / p;
+  }
+  dV.S_energy = -(dU.density - s.U.density * dp / p) / p;
+  return dV;
+}
+
+CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) {
+  StateConservative U;
+  CeedScalar        velocity[3];
+  for (int i = 0; i < 3; i++) velocity[i] = -V.S_momentum[i] / V.S_energy;
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic = 0.5 * Dot3(velocity, velocity);
+  const CeedScalar entropy   = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
+  const CeedScalar log_rho   = -(entropy + log(-V.S_energy)) / (gamma - 1);
+  U.density                  = exp(log_rho);
+  for (int i = 0; i < 3; i++) U.momentum[i] = U.density * velocity[i];
+
+  const CeedScalar e_internal = -gas->cv / (GasConstant(gas) * V.S_energy);
+  U.E_total                   = U.density * (e_internal + e_kinetic);
+  return U;
+}
+
+CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) {
+  StateConservative dU;
+  CeedScalar        dvelocity[3];
+  for (int i = 0; i < 3; i++) dvelocity[i] = (-dV.S_momentum[i] - s.Y.velocity[i] * dV.S_energy) / s.V.S_energy;
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dvelocity, s.Y.velocity);
+  const CeedScalar entropy    = gamma - (gamma - 1) * (s.V.S_density - e_kinetic * s.V.S_energy);
+  const CeedScalar dentropy   = -(gamma - 1) * (dV.S_density - (de_kinetic * s.V.S_energy + e_kinetic * dV.S_energy));
+  const CeedScalar log_rho    = -(entropy + log(-s.V.S_energy)) / (gamma - 1);
+  const CeedScalar rho        = exp(log_rho);
+  dU.density                  = -rho / (gamma - 1) * (dentropy + dV.S_energy / s.V.S_energy);
+  for (int i = 0; i < 3; i++) dU.momentum[i] = dU.density * s.Y.velocity[i] + s.U.density * dvelocity[i];
+
+  const CeedScalar e_internal  = -gas->cv / (GasConstant(gas) * s.V.S_energy);
+  const CeedScalar de_internal = gas->cv * dV.S_energy / (GasConstant(gas) * s.V.S_energy * s.V.S_energy);
+  const CeedScalar e_total     = e_internal + e_kinetic;
+  dU.E_total                   = dU.density * e_total + s.U.density * (de_internal + de_kinetic);
+  return dU;
+}
+
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
+  StateEntropy     V;
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar rho       = Y.pressure / (GasConstant(gas) * Y.temperature);
+  const CeedScalar entropy   = log(Y.pressure) - gamma * log(rho);
+  const CeedScalar rho_div_p = rho / Y.pressure;
+  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
+
+  V.S_density = (gamma - entropy) / (gamma - 1) - rho_div_p * e_kinetic;
+  for (int i = 0; i < 3; i++) V.S_momentum[i] = rho_div_p * Y.velocity[i];
+  V.S_energy = -rho_div_p;
+  return V;
+}
+
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY) {
+  StateEntropy     dV;
+  const CeedScalar gamma = HeatCapacityRatio(gas);
+  CeedScalar       drho  = (dY.pressure * s.Y.temperature - s.Y.pressure * dY.temperature) / (GasConstant(gas) * s.Y.temperature * s.Y.temperature);
+
+  const CeedScalar e_kinetic  = .5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
+  const CeedScalar rho_div_p  = s.U.density / s.Y.pressure;
+  const CeedScalar drho_div_p = (drho * s.Y.pressure - s.U.density * dY.pressure) / Square(s.Y.pressure);
+
+  CeedScalar dentropy = dY.pressure / s.Y.pressure - gamma * drho / s.U.density;
+
+  dV.S_density = -dentropy / (gamma - 1) - de_kinetic * rho_div_p - e_kinetic * drho_div_p;
+  for (CeedInt i = 0; i < 3; i++) dV.S_momentum[i] = rho_div_p * dY.velocity[i] + drho_div_p * s.Y.velocity[i];
+  dV.S_energy = -drho_div_p;
+  return dV;
+}
+
+CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) {
+  StatePrimitive Y;
+  for (int i = 0; i < 3; i++) Y.velocity[i] = -V.S_momentum[i] / V.S_energy;
+  Y.temperature              = -1 / (GasConstant(gas) * V.S_energy);
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
+  const CeedScalar entropy   = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
+  const CeedScalar log_P     = -(entropy + gamma * log(-V.S_energy)) / (gamma - 1);
+  Y.pressure                 = exp(log_P);
+  return Y;
+}
+
+CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) {
+  StatePrimitive dY;
+  for (int i = 0; i < 3; i++) dY.velocity[i] = -(dV.S_momentum[i] - s.V.S_momentum[i] * dV.S_energy / s.V.S_energy) / s.V.S_energy;
+  dY.temperature              = dV.S_energy / (GasConstant(gas) * s.V.S_energy * s.V.S_energy);
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
+  const CeedScalar dentropy   = (1 - gamma) * (dV.S_density - e_kinetic * dV.S_energy - de_kinetic * s.V.S_energy);
+  dY.pressure                 = s.Y.pressure * (-dentropy - gamma * dV.S_energy / s.V.S_energy) / (gamma - 1);
+  return dY;
+}
+
 CEED_QFUNCTION_HELPER State StateFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
   StateConservative U = StateConservativeFromPrimitive(gas, Y);
+  StateEntropy      V = StateEntropyFromPrimitive(gas, Y);
   State             s;
   s.U = U;
   s.Y = Y;
+  s.V = V;
   return s;
 }
 
 CEED_QFUNCTION_HELPER State StateFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY) {
   StateConservative dU = StateConservativeFromPrimitive_fwd(gas, s, dY);
+  StateEntropy      dV = StateEntropyFromPrimitive_fwd(gas, s, dY);
   State             ds;
   ds.U = dU;
   ds.Y = dY;
+  ds.V = dV;
   return ds;
 }
 
@@ -156,6 +293,8 @@ CEED_QFUNCTION_HELPER void StateToU(NewtonianIdealGasContext gas, const State in
 
 CEED_QFUNCTION_HELPER void StateToY(NewtonianIdealGasContext gas, const State input, CeedScalar Y[5]) { UnpackState_Y(input.Y, Y); }
 
+CEED_QFUNCTION_HELPER void StateToV(NewtonianIdealGasContext gas, const State input, CeedScalar V[5]) { UnpackState_V(input.V, V); }
+
 CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State input, CeedScalar Q[5], StateVariable state_var) {
   switch (state_var) {
     case STATEVAR_CONSERVATIVE:
@@ -164,6 +303,9 @@ CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State in
     case STATEVAR_PRIMITIVE:
       StateToY(gas, input, Q);
       break;
+    case STATEVAR_ENTROPY:
+      StateToV(gas, input, Q);
+      break;
   }
 }
 
@@ -175,6 +317,7 @@ CEED_QFUNCTION_HELPER State StateFromU(NewtonianIdealGasContext gas, const CeedS
   s.U.momentum[2] = U[3];
   s.U.E_total     = U[4];
   s.Y             = StatePrimitiveFromConservative(gas, s.U);
+  s.V             = StateEntropyFromConservative(gas, s.U);
   return s;
 }
 
@@ -186,6 +329,7 @@ CEED_QFUNCTION_HELPER State StateFromU_fwd(NewtonianIdealGasContext gas, State s
   ds.U.momentum[2] = dU[3];
   ds.U.E_total     = dU[4];
   ds.Y             = StatePrimitiveFromConservative_fwd(gas, s, ds.U);
+  ds.V             = StateEntropyFromConservative_fwd(gas, s, ds.U);
   return ds;
 }
 
@@ -197,6 +341,7 @@ CEED_QFUNCTION_HELPER State StateFromY(NewtonianIdealGasContext gas, const CeedS
   s.Y.velocity[2] = Y[3];
   s.Y.temperature = Y[4];
   s.U             = StateConservativeFromPrimitive(gas, s.Y);
+  s.V             = StateEntropyFromPrimitive(gas, s.Y);
   return s;
 }
 
@@ -208,6 +353,31 @@ CEED_QFUNCTION_HELPER State StateFromY_fwd(NewtonianIdealGasContext gas, State s
   ds.Y.velocity[2] = dY[3];
   ds.Y.temperature = dY[4];
   ds.U             = StateConservativeFromPrimitive_fwd(gas, s, ds.Y);
+  ds.V             = StateEntropyFromPrimitive_fwd(gas, s, ds.Y);
+  return ds;
+}
+
+CEED_QFUNCTION_HELPER State StateFromV(NewtonianIdealGasContext gas, const CeedScalar V[5]) {
+  State s;
+  s.V.S_density     = V[0];
+  s.V.S_momentum[0] = V[1];
+  s.V.S_momentum[1] = V[2];
+  s.V.S_momentum[2] = V[3];
+  s.V.S_energy      = V[4];
+  s.U               = StateConservativeFromEntropy(gas, s.V);
+  s.Y               = StatePrimitiveFromEntropy(gas, s.V);
+  return s;
+}
+
+CEED_QFUNCTION_HELPER State StateFromV_fwd(NewtonianIdealGasContext gas, State s, const CeedScalar dV[5]) {
+  State ds;
+  ds.V.S_density     = dV[0];
+  ds.V.S_momentum[0] = dV[1];
+  ds.V.S_momentum[1] = dV[2];
+  ds.V.S_momentum[2] = dV[3];
+  ds.V.S_energy      = dV[4];
+  ds.U               = StateConservativeFromEntropy_fwd(gas, s, ds.V);
+  ds.Y               = StatePrimitiveFromEntropy_fwd(gas, s, ds.V);
   return ds;
 }
 
@@ -220,6 +390,9 @@ CEED_QFUNCTION_HELPER State StateFromQ(NewtonianIdealGasContext gas, const CeedS
     case STATEVAR_PRIMITIVE:
       s = StateFromY(gas, Q);
       break;
+    case STATEVAR_ENTROPY:
+      s = StateFromV(gas, Q);
+      break;
   }
   return s;
 }
@@ -233,6 +406,9 @@ CEED_QFUNCTION_HELPER State StateFromQ_fwd(NewtonianIdealGasContext gas, State s
     case STATEVAR_PRIMITIVE:
       ds = StateFromY_fwd(gas, s, dQ);
       break;
+    case STATEVAR_ENTROPY:
+      ds = StateFromV_fwd(gas, s, dQ);
+      break;
   }
   return ds;
 }
diff --git a/examples/fluids/qfunctions/newtonian_types.h b/examples/fluids/qfunctions/newtonian_types.h
index 0f238298b7..60478c397b 100644
--- a/examples/fluids/qfunctions/newtonian_types.h
+++ b/examples/fluids/qfunctions/newtonian_types.h
@@ -13,6 +13,7 @@
 typedef enum {
   STATEVAR_CONSERVATIVE = 0,
   STATEVAR_PRIMITIVE    = 1,
+  STATEVAR_ENTROPY      = 2,
 } StateVariable;
 
 typedef struct NewtonianIdealGasContext_ *NewtonianIdealGasContext;
@@ -48,6 +49,12 @@ typedef struct {
   CeedScalar temperature;
 } StatePrimitive;
 
+typedef struct {
+  CeedScalar S_density;
+  CeedScalar S_momentum[3];
+  CeedScalar S_energy;
+} StateEntropy;
+
 typedef struct SetupContext_ *SetupContext;
 struct SetupContext_ {
   StatePrimitive                   reference;

From a2d72b6f1ed489cbeb0eb5f72cf8bf977e7ff50a Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 17 Jun 2024 10:52:21 -0600
Subject: [PATCH 074/571] fluids: Proliferate entropy variables, add test

Co-authored-by: Zach Atkins <zacharyjayhawk@gmail.com>
---
 examples/fluids/README.md                     |   2 +-
 examples/fluids/navierstokes.c                |   1 +
 examples/fluids/problems/bc_freestream.c      |  28 ++++++
 examples/fluids/problems/bc_slip.c            |   6 ++
 examples/fluids/problems/gaussianwave.c       |   4 +
 examples/fluids/problems/newtonian.c          |  10 +++
 examples/fluids/problems/sgs_dd_model.c       |  17 ++--
 examples/fluids/problems/stg_shur14.c         |   5 ++
 examples/fluids/qfunctions/bc_freestream.h    |  32 +++++++
 examples/fluids/qfunctions/bc_slip.h          |   8 ++
 examples/fluids/qfunctions/blasius.h          |  11 +--
 examples/fluids/qfunctions/channel.h          |  13 +--
 examples/fluids/qfunctions/densitycurrent.h   |  13 +--
 .../fluids/qfunctions/differential_filter.h   |   4 +
 examples/fluids/qfunctions/gaussianwave.h     |   4 +
 examples/fluids/qfunctions/newtonian.h        |  25 +++++-
 examples/fluids/qfunctions/sgs_dd_model.h     |  12 +++
 examples/fluids/qfunctions/stg_shur14.h       |  82 ++++++++----------
 examples/fluids/qfunctions/taylorgreen.h      |  22 ++---
 examples/fluids/qfunctions/turb_spanstats.h   |   4 +
 .../qfunctions/velocity_gradient_projection.h |   4 +
 examples/fluids/src/differential_filter.c     |   5 +-
 examples/fluids/src/setupdm.c                 |   8 ++
 examples/fluids/src/turb_spanstats.c          |   5 +-
 .../fluids/src/velocity_gradient_projection.c |   6 +-
 ...-navierstokes-gaussianwave-IDL-entropy.bin | Bin 0 -> 2340 bytes
 26 files changed, 226 insertions(+), 105 deletions(-)
 create mode 100644 examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin

diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index 5b038c71f4..0c4fa11cce 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -615,7 +615,7 @@ For the Density Current, Channel, and Blasius problems, the following common com
   - boolean
 
 * - `-state_var`
-  - State variables to solve solution with. `conservative` ($\rho, \rho \bm{u}, \rho e$) or `primitive` ($P, \bm{u}, T$)
+  - State variables to solve solution with. `conservative` ($\rho, \rho \bm{u}, \rho e$), `primitive` ($P, \bm{u}, T$), or `entropy` ($\frac{\gamma - s}{\gamma - 1} - \frac{\rho}{P} (e - c_v T),\ \frac{\rho}{P} \bm{u},\ -\frac{\rho}{P}$) where  $s = \ln(P\rho^{-\gamma})$
   - `conservative`
   - string
 
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index cc619ea82c..613edfc3a8 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -18,6 +18,7 @@
 //     ./navierstokes -ceed /cpu/self -options_file gaussianwave.yml
 //     ./navierstokes -ceed /gpu/cuda -problem advection -degree 1
 //
+//TESTARGS(name="Gaussian Wave, IDL and Entropy variables") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -state_var entropy -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70 -newtonian_unit_tests
 //TESTARGS(name="Blasius, SGS DataDriven Sequential Torch",only="torch") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_torch -sgs_model_dd_torch_model_path ./examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
 //TESTARGS(name="Blasius, SGS DataDriven Sequential Ceed") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_ceed
 //TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c
index cff3c74e67..e57dcf6272 100644
--- a/examples/fluids/problems/bc_freestream.c
+++ b/examples/fluids/problems/bc_freestream.c
@@ -78,6 +78,22 @@ PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, Newtonia
           break;
       }
       break;
+    case STATEVAR_ENTROPY:
+      switch (riemann) {
+        case RIEMANN_HLL:
+          problem->apply_freestream.qfunction              = Freestream_Entropy_HLL;
+          problem->apply_freestream.qfunction_loc          = Freestream_Entropy_HLL_loc;
+          problem->apply_freestream_jacobian.qfunction     = Freestream_Jacobian_Entropy_HLL;
+          problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Entropy_HLL_loc;
+          break;
+        case RIEMANN_HLLC:
+          problem->apply_freestream.qfunction              = Freestream_Entropy_HLLC;
+          problem->apply_freestream.qfunction_loc          = Freestream_Entropy_HLLC_loc;
+          problem->apply_freestream_jacobian.qfunction     = Freestream_Jacobian_Entropy_HLLC;
+          problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Entropy_HLLC_loc;
+          break;
+      }
+      break;
   }
 
   Y_inf.pressure *= Pascal;
@@ -148,6 +164,12 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
           problem->apply_outflow_jacobian.qfunction     = RiemannOutflow_Jacobian_Prim;
           problem->apply_outflow_jacobian.qfunction_loc = RiemannOutflow_Jacobian_Prim_loc;
           break;
+        case STATEVAR_ENTROPY:
+          problem->apply_outflow.qfunction              = RiemannOutflow_Entropy;
+          problem->apply_outflow.qfunction_loc          = RiemannOutflow_Entropy_loc;
+          problem->apply_outflow_jacobian.qfunction     = RiemannOutflow_Jacobian_Entropy;
+          problem->apply_outflow_jacobian.qfunction_loc = RiemannOutflow_Jacobian_Entropy_loc;
+          break;
       }
       break;
     case OUTFLOW_PRESSURE:
@@ -164,6 +186,12 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
           problem->apply_outflow_jacobian.qfunction     = PressureOutflow_Jacobian_Prim;
           problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Prim_loc;
           break;
+        case STATEVAR_ENTROPY:
+          problem->apply_outflow.qfunction              = PressureOutflow_Entropy;
+          problem->apply_outflow.qfunction_loc          = PressureOutflow_Entropy_loc;
+          problem->apply_outflow_jacobian.qfunction     = PressureOutflow_Jacobian_Entropy;
+          problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Entropy_loc;
+          break;
       }
       break;
   }
diff --git a/examples/fluids/problems/bc_slip.c b/examples/fluids/problems/bc_slip.c
index 4b6708436e..05c686d4ff 100644
--- a/examples/fluids/problems/bc_slip.c
+++ b/examples/fluids/problems/bc_slip.c
@@ -34,6 +34,12 @@ PetscErrorCode SlipBCSetup(ProblemData problem, DM dm, void *ctx, CeedQFunctionC
       problem->apply_slip_jacobian.qfunction     = Slip_Jacobian_Prim;
       problem->apply_slip_jacobian.qfunction_loc = Slip_Jacobian_Prim_loc;
       break;
+    case STATEVAR_ENTROPY:
+      problem->apply_slip.qfunction              = Slip_Entropy;
+      problem->apply_slip.qfunction_loc          = Slip_Entropy_loc;
+      problem->apply_slip_jacobian.qfunction     = Slip_Jacobian_Entropy;
+      problem->apply_slip_jacobian.qfunction_loc = Slip_Jacobian_Entropy_loc;
+      break;
   }
 
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(newtonian_ig_qfctx, &problem->apply_slip.qfunction_context));
diff --git a/examples/fluids/problems/gaussianwave.c b/examples/fluids/problems/gaussianwave.c
index 9af7924b78..2ee5e41726 100644
--- a/examples/fluids/problems/gaussianwave.c
+++ b/examples/fluids/problems/gaussianwave.c
@@ -37,6 +37,10 @@ PetscErrorCode NS_GAUSSIAN_WAVE(ProblemData problem, DM dm, void *ctx, SimpleBC
       problem->ics.qfunction     = IC_GaussianWave_Prim;
       problem->ics.qfunction_loc = IC_GaussianWave_Prim_loc;
       break;
+    case STATEVAR_ENTROPY:
+      problem->ics.qfunction     = IC_GaussianWave_Entropy;
+      problem->ics.qfunction_loc = IC_GaussianWave_Entropy_loc;
+      break;
   }
 
   // -- Option Defaults
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 39e8c469d9..e1aecc6ca6 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -304,6 +304,16 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
       problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Prim_loc;
       break;
     case STATEVAR_ENTROPY:
+      problem->ics.qfunction                       = ICsNewtonianIG_Entropy;
+      problem->ics.qfunction_loc                   = ICsNewtonianIG_Entropy_loc;
+      problem->apply_vol_ifunction.qfunction       = IFunction_Newtonian_Entropy;
+      problem->apply_vol_ifunction.qfunction_loc   = IFunction_Newtonian_Entropy_loc;
+      problem->apply_vol_ijacobian.qfunction       = IJacobian_Newtonian_Entropy;
+      problem->apply_vol_ijacobian.qfunction_loc   = IJacobian_Newtonian_Entropy_loc;
+      problem->apply_inflow.qfunction              = BoundaryIntegral_Entropy;
+      problem->apply_inflow.qfunction_loc          = BoundaryIntegral_Entropy_loc;
+      problem->apply_inflow_jacobian.qfunction     = BoundaryIntegral_Jacobian_Entropy;
+      problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Entropy_loc;
       break;
   }
 
diff --git a/examples/fluids/problems/sgs_dd_model.c b/examples/fluids/problems/sgs_dd_model.c
index b8e6dcdbc5..49a7efb9f2 100644
--- a/examples/fluids/problems/sgs_dd_model.c
+++ b/examples/fluids/problems/sgs_dd_model.c
@@ -109,8 +109,9 @@ static PetscErrorCode SgsDDSetupNodalEvaluation_Fused(Ceed ceed, User user, Ceed
     case STATEVAR_CONSERVATIVE:
       PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Conserv, ComputeSgsDDNodal_Conserv_loc, &qf_sgs_dd_nodal));
       break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Data-driven SGS nodal evaluation not available for chosen state variable");
+    case STATEVAR_ENTROPY:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Entropy, ComputeSgsDDNodal_Entropy_loc, &qf_sgs_dd_nodal));
+      break;
   }
 
   // Mesh/geometry order and solution basis order may differ, therefore must interpolate
@@ -347,9 +348,10 @@ static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential(Ceed ceed, User user,
         PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Conserv,
                                                         ComputeSgsDDNodal_Sequential_Inputs_Conserv_loc, &qf_sgs_dd_inputs));
         break;
-      default:
-        SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP,
-                "Data-driven SGS nodal input evaluation not available for chosen state variable");
+      case STATEVAR_ENTROPY:
+        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Entropy,
+                                                        ComputeSgsDDNodal_Sequential_Inputs_Entropy_loc, &qf_sgs_dd_inputs));
+        break;
     }
 
     PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_inputs, sgs_dd_setup_data->sgsdd_qfctx));
@@ -451,8 +453,9 @@ static PetscErrorCode SgsSetupNodalIFunction(Ceed ceed, User user, CeedData ceed
     case STATEVAR_CONSERVATIVE:
       PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Conserv, IFunction_NodalSgs_Conserv_loc, &qf_sgs_apply));
       break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Nodal SGS evaluation not available for chosen state variable");
+    case STATEVAR_ENTROPY:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Entropy, IFunction_NodalSgs_Entropy_loc, &qf_sgs_apply));
+      break;
   }
 
   PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_apply, sgs_dd_setup_data->ifunction_qfctx));
diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c
index 5d9d4f1dc4..98858886ae 100644
--- a/examples/fluids/problems/stg_shur14.c
+++ b/examples/fluids/problems/stg_shur14.c
@@ -307,6 +307,11 @@ PetscErrorCode SetupStrongStg(DM dm, SimpleBC bc, ProblemData problem, Physics p
       // {1,2,3,4} for u, v, w, T
       for (int i = 0; i < 4; i++) comps[i] = i + 1;
       break;
+
+    case STATEVAR_ENTROPY:
+      // {1,2,3,4}
+      for (int i = 0; i < 4; i++) comps[i] = i + 1;
+      break;
   }
 
   PetscCall(DMGetLabel(dm, "Face Sets", &label));
diff --git a/examples/fluids/qfunctions/bc_freestream.h b/examples/fluids/qfunctions/bc_freestream.h
index 5fb4da2289..4475f65495 100644
--- a/examples/fluids/qfunctions/bc_freestream.h
+++ b/examples/fluids/qfunctions/bc_freestream.h
@@ -64,6 +64,10 @@ CEED_QFUNCTION(Freestream_Prim_HLL)(void *ctx, CeedInt Q, const CeedScalar *cons
   return Freestream(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLL);
 }
 
+CEED_QFUNCTION(Freestream_Entropy_HLL)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLL);
+}
+
 CEED_QFUNCTION(Freestream_Conserv_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return Freestream(ctx, Q, in, out, STATEVAR_CONSERVATIVE, RIEMANN_HLLC);
 }
@@ -72,6 +76,10 @@ CEED_QFUNCTION(Freestream_Prim_HLLC)(void *ctx, CeedInt Q, const CeedScalar *con
   return Freestream(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLLC);
 }
 
+CEED_QFUNCTION(Freestream_Entropy_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLLC);
+}
+
 CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var,
                                               RiemannFluxType flux_type) {
   const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
@@ -120,6 +128,10 @@ CEED_QFUNCTION(Freestream_Jacobian_Prim_HLL)(void *ctx, CeedInt Q, const CeedSca
   return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLL);
 }
 
+CEED_QFUNCTION(Freestream_Jacobian_Entropy_HLL)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLL);
+}
+
 CEED_QFUNCTION(Freestream_Jacobian_Conserv_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_CONSERVATIVE, RIEMANN_HLLC);
 }
@@ -128,6 +140,10 @@ CEED_QFUNCTION(Freestream_Jacobian_Prim_HLLC)(void *ctx, CeedInt Q, const CeedSc
   return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLLC);
 }
 
+CEED_QFUNCTION(Freestream_Jacobian_Entropy_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLLC);
+}
+
 // Note the identity
 //
 // softplus(x) - x = log(1 + exp(x)) - x
@@ -215,6 +231,10 @@ CEED_QFUNCTION(RiemannOutflow_Prim)(void *ctx, CeedInt Q, const CeedScalar *cons
   return RiemannOutflow(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(RiemannOutflow_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return RiemannOutflow(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Jacobian for Riemann pressure/temperature outflow boundary condition
 // *****************************************************************************
@@ -287,6 +307,10 @@ CEED_QFUNCTION(RiemannOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedSca
   return RiemannOutflow_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(RiemannOutflow_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return RiemannOutflow_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Outflow boundary condition, weakly setting a constant pressure. This is the
 // classic outflow condition used by PHASTA-C and retained largely for
@@ -348,6 +372,10 @@ CEED_QFUNCTION(PressureOutflow_Prim)(void *ctx, CeedInt Q, const CeedScalar *con
   return PressureOutflow(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(PressureOutflow_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return PressureOutflow(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Jacobian for weak-pressure outflow boundary condition
 // *****************************************************************************
@@ -406,3 +434,7 @@ CEED_QFUNCTION(PressureOutflow_Jacobian_Conserv)(void *ctx, CeedInt Q, const Cee
 CEED_QFUNCTION(PressureOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return PressureOutflow_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(PressureOutflow_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return PressureOutflow_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/bc_slip.h b/examples/fluids/qfunctions/bc_slip.h
index 816d4957ca..3a7c5b5bc2 100644
--- a/examples/fluids/qfunctions/bc_slip.h
+++ b/examples/fluids/qfunctions/bc_slip.h
@@ -56,6 +56,10 @@ CEED_QFUNCTION(Slip_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, Cee
   return Slip(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(Slip_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Slip(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 CEED_QFUNCTION_HELPER int Slip_Jacobian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var) {
   const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   const CeedScalar(*q_data_sur)     = in[2];
@@ -104,3 +108,7 @@ CEED_QFUNCTION(Slip_Jacobian_Conserv)(void *ctx, CeedInt Q, const CeedScalar *co
 CEED_QFUNCTION(Slip_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return Slip_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(Slip_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Slip_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index 66ea2d3857..384a67a903 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -118,14 +118,7 @@ CEED_QFUNCTION(ICsBlasius)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
     State            s    = BlasiusSolution(context, x, x0, x_inflow, S_infty.U.density, &t12);
     CeedScalar       q[5] = {0};
 
-    switch (gas->state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
@@ -165,9 +158,11 @@ CEED_QFUNCTION(Blasius_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in
     if (context->weakT) {  // density from the current solution
       s.U.density = s_int.U.density;
       s.Y         = StatePrimitiveFromConservative(gas, s.U);
+      s.V         = StateEntropyFromConservative(gas, s.U);
     } else {  // Total energy from current solution
       s.U.E_total = s_int.U.E_total;
       s.Y         = StatePrimitiveFromConservative(gas, s.U);
+      s.V         = StateEntropyFromConservative(gas, s.U);
     }
 
     StateConservative Flux_inviscid[3];
diff --git a/examples/fluids/qfunctions/channel.h b/examples/fluids/qfunctions/channel.h
index 7634696c74..21db7c8dd6 100644
--- a/examples/fluids/qfunctions/channel.h
+++ b/examples/fluids/qfunctions/channel.h
@@ -64,21 +64,14 @@ CEED_QFUNCTION(ICsChannel)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
   const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   CeedScalar(*q0)[CEED_Q_VLA]      = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const ChannelContext context = (ChannelContext)ctx;
+  const ChannelContext           context = (ChannelContext)ctx;
+  const NewtonianIdealGasContext gas     = &context->newtonian_ctx;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[]  = {X[0][i], X[1][i], X[2][i]};
     State            s    = Exact_Channel(3, 0., x, 5, ctx);
     CeedScalar       q[5] = {0};
-    switch (context->newtonian_ctx.state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
-
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
diff --git a/examples/fluids/qfunctions/densitycurrent.h b/examples/fluids/qfunctions/densitycurrent.h
index 4d10261b4f..c5fe3752a0 100644
--- a/examples/fluids/qfunctions/densitycurrent.h
+++ b/examples/fluids/qfunctions/densitycurrent.h
@@ -133,21 +133,14 @@ CEED_QFUNCTION(ICsDC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca
   const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   CeedScalar(*q0)[CEED_Q_VLA]      = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const DensityCurrentContext context = (DensityCurrentContext)ctx;
+  const DensityCurrentContext    context = (DensityCurrentContext)ctx;
+  const NewtonianIdealGasContext gas     = &context->newtonian_ctx;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[]  = {X[0][i], X[1][i], X[2][i]};
     State            s    = Exact_DC(3, 0., x, 5, ctx);
     CeedScalar       q[5] = {0};
-    switch (context->newtonian_ctx.state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
-
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
diff --git a/examples/fluids/qfunctions/differential_filter.h b/examples/fluids/qfunctions/differential_filter.h
index 703bc2bfc8..10b89b70c7 100644
--- a/examples/fluids/qfunctions/differential_filter.h
+++ b/examples/fluids/qfunctions/differential_filter.h
@@ -64,6 +64,10 @@ CEED_QFUNCTION(DifferentialFilter_RHS_Prim)(void *ctx, CeedInt Q, const CeedScal
   return DifferentialFilter_RHS(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(DifferentialFilter_RHS_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return DifferentialFilter_RHS(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 CEED_QFUNCTION_HELPER CeedScalar VanDriestWallDamping(const CeedScalar wall_dist_plus, const CeedScalar A_plus) {
   return -expm1(-wall_dist_plus / A_plus);
 }
diff --git a/examples/fluids/qfunctions/gaussianwave.h b/examples/fluids/qfunctions/gaussianwave.h
index 4115d86a81..88f9feb126 100644
--- a/examples/fluids/qfunctions/gaussianwave.h
+++ b/examples/fluids/qfunctions/gaussianwave.h
@@ -69,3 +69,7 @@ CEED_QFUNCTION(IC_GaussianWave_Conserv)(void *ctx, CeedInt Q, const CeedScalar *
 CEED_QFUNCTION(IC_GaussianWave_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return IC_GaussianWave(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(IC_GaussianWave_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return IC_GaussianWave(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h
index a94ee4f604..66fc309018 100644
--- a/examples/fluids/qfunctions/newtonian.h
+++ b/examples/fluids/qfunctions/newtonian.h
@@ -43,11 +43,16 @@ CEED_QFUNCTION_HELPER int ICsNewtonianIG(void *ctx, CeedInt Q, const CeedScalar
   return 0;
 }
 
+CEED_QFUNCTION(ICsNewtonianIG_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
+}
+
 CEED_QFUNCTION(ICsNewtonianIG_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
-CEED_QFUNCTION(ICsNewtonianIG_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
+
+CEED_QFUNCTION(ICsNewtonianIG_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_ENTROPY);
 }
 
 CEED_QFUNCTION_HELPER void MassFunction_Newtonian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
@@ -292,6 +297,10 @@ CEED_QFUNCTION(IFunction_Newtonian_Prim)(void *ctx, CeedInt Q, const CeedScalar
   return IFunction_Newtonian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(IFunction_Newtonian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return IFunction_Newtonian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // This QFunction implements the jacobian of the Navier-Stokes equations for implicit time stepping method.
 // *****************************************************************************
@@ -374,6 +383,10 @@ CEED_QFUNCTION(IJacobian_Newtonian_Prim)(void *ctx, CeedInt Q, const CeedScalar
   return IJacobian_Newtonian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(IJacobian_Newtonian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return IJacobian_Newtonian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Compute boundary integral (ie. for strongly set inflows)
 // *****************************************************************************
@@ -428,6 +441,10 @@ CEED_QFUNCTION(BoundaryIntegral_Prim)(void *ctx, CeedInt Q, const CeedScalar *co
   return BoundaryIntegral(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(BoundaryIntegral_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return BoundaryIntegral(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Jacobian for "set nothing" boundary integral
 // *****************************************************************************
@@ -483,3 +500,7 @@ CEED_QFUNCTION(BoundaryIntegral_Jacobian_Conserv)(void *ctx, CeedInt Q, const Ce
 CEED_QFUNCTION(BoundaryIntegral_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return BoundaryIntegral_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(BoundaryIntegral_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return BoundaryIntegral_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/sgs_dd_model.h b/examples/fluids/qfunctions/sgs_dd_model.h
index 9f05f07960..e904389814 100644
--- a/examples/fluids/qfunctions/sgs_dd_model.h
+++ b/examples/fluids/qfunctions/sgs_dd_model.h
@@ -106,6 +106,10 @@ CEED_QFUNCTION(ComputeSgsDDNodal_Conserv)(void *ctx, CeedInt Q, const CeedScalar
   return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
 }
 
+CEED_QFUNCTION(ComputeSgsDDNodal_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // @brief Calculate inputs to anisotropic data-driven model
 CEED_QFUNCTION_HELPER int ComputeSgsDDNodal_Sequential_Inputs(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
                                                               StateVariable state_var) {
@@ -149,6 +153,10 @@ CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Conserv)(void *ctx, CeedInt Q
   return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
 }
 
+CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // @brief Runs inference on the data-driven model, used predominantsly for testing and validation
 CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inference)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar(*model_inputs)     = in[0];
@@ -245,3 +253,7 @@ CEED_QFUNCTION(IFunction_NodalSgs_Conserv)(void *ctx, CeedInt Q, const CeedScala
 CEED_QFUNCTION(IFunction_NodalSgs_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(IFunction_NodalSgs_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index d6c7464660..ccebf38fca 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -279,15 +279,15 @@ CEED_QFUNCTION(ICsStg)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSc
   const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1];
   CeedScalar(*q0)[CEED_Q_VLA]         = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const StgShur14Context stg_ctx = (StgShur14Context)ctx;
-  CeedScalar             qn[STG_NMODES_MAX], u[3], ubar[3], cij[6], eps, lt;
-  const CeedScalar       dx     = stg_ctx->dx;
-  const CeedScalar       time   = stg_ctx->time;
-  const CeedScalar       theta0 = stg_ctx->theta0;
-  const CeedScalar       P0     = stg_ctx->P0;
-  const CeedScalar       cv     = stg_ctx->newtonian_ctx.cv;
-  const CeedScalar       rho    = P0 / (GasConstant(&stg_ctx->newtonian_ctx) * theta0);
-  const CeedScalar       nu     = stg_ctx->newtonian_ctx.mu / rho;
+  const StgShur14Context         stg_ctx = (StgShur14Context)ctx;
+  const NewtonianIdealGasContext gas     = &stg_ctx->newtonian_ctx;
+  CeedScalar                     qn[STG_NMODES_MAX], u[3], ubar[3], cij[6], eps, lt;
+  const CeedScalar               dx     = stg_ctx->dx;
+  const CeedScalar               time   = stg_ctx->time;
+  const CeedScalar               theta0 = stg_ctx->theta0;
+  const CeedScalar               P0     = stg_ctx->P0;
+  const CeedScalar               rho    = P0 / (GasConstant(gas) * theta0);
+  const CeedScalar               nu     = gas->mu / rho;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]};
@@ -305,22 +305,11 @@ CEED_QFUNCTION(ICsStg)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSc
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
     }
 
-    switch (stg_ctx->newtonian_ctx.state_var) {
-      case STATEVAR_CONSERVATIVE:
-        q0[0][i] = rho;
-        q0[1][i] = u[0] * rho;
-        q0[2][i] = u[1] * rho;
-        q0[3][i] = u[2] * rho;
-        q0[4][i] = rho * (0.5 * Dot3(u, u) + cv * theta0);
-        break;
-
-      case STATEVAR_PRIMITIVE:
-        q0[0][i] = P0;
-        q0[1][i] = u[0];
-        q0[2][i] = u[1];
-        q0[3][i] = u[2];
-        q0[4][i] = theta0;
-        break;
+    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5];
+    State      s    = StateFromY(gas, Y);
+    StateToQ(gas, s, q, gas->state_var);
+    for (CeedInt j = 0; j < 5; j++) {
+      q0[j][i] = q[j];
     }
   }
   return 0;
@@ -477,15 +466,16 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar *
   const CeedScalar(*inv_Ektotal)           = (const CeedScalar(*))in[3];
   CeedScalar(*bcval)[CEED_Q_VLA]           = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const StgShur14Context stg_ctx = (StgShur14Context)ctx;
-  CeedScalar             u[3], ubar[3], cij[6], eps, lt;
-  const bool             mean_only = stg_ctx->mean_only;
-  const CeedScalar       dx        = stg_ctx->dx;
-  const CeedScalar       time      = stg_ctx->time;
-  const CeedScalar       theta0    = stg_ctx->theta0;
-  const CeedScalar       P0        = stg_ctx->P0;
-  const CeedScalar       rho       = P0 / (GasConstant(&stg_ctx->newtonian_ctx) * theta0);
-  const CeedScalar       nu        = stg_ctx->newtonian_ctx.mu / rho;
+  const StgShur14Context         stg_ctx = (StgShur14Context)ctx;
+  const NewtonianIdealGasContext gas     = &stg_ctx->newtonian_ctx;
+  CeedScalar                     u[3], ubar[3], cij[6], eps, lt;
+  const bool                     mean_only = stg_ctx->mean_only;
+  const CeedScalar               dx        = stg_ctx->dx;
+  const CeedScalar               time      = stg_ctx->time;
+  const CeedScalar               theta0    = stg_ctx->theta0;
+  const CeedScalar               P0        = stg_ctx->P0;
+  const CeedScalar               rho       = P0 / (GasConstant(gas) * theta0);
+  const CeedScalar               nu        = gas->mu / rho;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[]        = {coords[0][i], coords[1][i], coords[2][i]};
@@ -511,22 +501,22 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar *
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
     }
 
-    switch (stg_ctx->newtonian_ctx.state_var) {
+    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5];
+    State      s    = StateFromY(gas, Y);
+    StateToQ(gas, s, q, gas->state_var);
+    switch (gas->state_var) {
       case STATEVAR_CONSERVATIVE:
-        bcval[0][i] = scale[i] * rho;
-        bcval[1][i] = scale[i] * rho * u[0];
-        bcval[2][i] = scale[i] * rho * u[1];
-        bcval[3][i] = scale[i] * rho * u[2];
-        bcval[4][i] = 0.;
+        q[4] = 0.;  // Don't set energy
         break;
-
       case STATEVAR_PRIMITIVE:
-        bcval[0][i] = 0;
-        bcval[1][i] = scale[i] * u[0];
-        bcval[2][i] = scale[i] * u[1];
-        bcval[3][i] = scale[i] * u[2];
-        bcval[4][i] = scale[i] * theta0;
+        q[0] = 0;  // Don't set pressure
         break;
+      case STATEVAR_ENTROPY:
+        q[0] = 0;  // Don't set V_density
+        break;
+    }
+    for (CeedInt j = 0; j < 5; j++) {
+      bcval[j][i] = scale[i] * q[j];
     }
   }
   return 0;
diff --git a/examples/fluids/qfunctions/taylorgreen.h b/examples/fluids/qfunctions/taylorgreen.h
index 72c128400d..ddf33e665b 100644
--- a/examples/fluids/qfunctions/taylorgreen.h
+++ b/examples/fluids/qfunctions/taylorgreen.h
@@ -17,12 +17,12 @@ CEED_QFUNCTION(ICsTaylorGreen)(void *ctx, CeedInt Q, const CeedScalar *const *in
 
   CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const SetupContext                context   = (SetupContext)ctx;
-  struct NewtonianIdealGasContext_ *gas       = &context->gas;
-  CeedScalar                        R         = GasConstant(gas);
-  StatePrimitive                    reference = context->reference;
-  const CeedScalar                  V0        = sqrt(Dot3(reference.velocity, reference.velocity));
-  const CeedScalar                  density0  = reference.pressure / (reference.temperature * R);
+  const SetupContext             context   = (SetupContext)ctx;
+  const NewtonianIdealGasContext gas       = &context->gas;
+  CeedScalar                     R         = GasConstant(gas);
+  StatePrimitive                 reference = context->reference;
+  const CeedScalar               V0        = sqrt(Dot3(reference.velocity, reference.velocity));
+  const CeedScalar               density0  = reference.pressure / (reference.temperature * R);
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     CeedScalar x[]  = {X[0][i], X[1][i], X[2][i]};
@@ -36,15 +36,7 @@ CEED_QFUNCTION(ICsTaylorGreen)(void *ctx, CeedInt Q, const CeedScalar *const *in
     Y[4] = reference.temperature;
 
     State s = StateFromY(gas, Y);
-    switch (gas->state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
-
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
diff --git a/examples/fluids/qfunctions/turb_spanstats.h b/examples/fluids/qfunctions/turb_spanstats.h
index dccba29a7b..344adeebaa 100644
--- a/examples/fluids/qfunctions/turb_spanstats.h
+++ b/examples/fluids/qfunctions/turb_spanstats.h
@@ -59,6 +59,10 @@ CEED_QFUNCTION(ChildStatsCollection_Prim)(void *ctx, CeedInt Q, const CeedScalar
   return ChildStatsCollection(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(ChildStatsCollection_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ChildStatsCollection(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // QFunctions for testing
 CEED_QFUNCTION_HELPER CeedScalar ChildStatsCollectionTest_Exact(const CeedScalar x_i[3]) { return x_i[0] + Square(x_i[1]); }
 
diff --git a/examples/fluids/qfunctions/velocity_gradient_projection.h b/examples/fluids/qfunctions/velocity_gradient_projection.h
index 73b51eff84..c21bb68adc 100644
--- a/examples/fluids/qfunctions/velocity_gradient_projection.h
+++ b/examples/fluids/qfunctions/velocity_gradient_projection.h
@@ -47,3 +47,7 @@ CEED_QFUNCTION(VelocityGradientProjectionRHS_Conserv)(void *ctx, CeedInt Q, cons
 CEED_QFUNCTION(VelocityGradientProjectionRHS_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return VelocityGradientProjectionRHS(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(VelocityGradientProjectionRHS_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return VelocityGradientProjectionRHS(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index 414c7154f8..04d264a112 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -36,8 +36,9 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
       case STATEVAR_CONSERVATIVE:
         PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, DifferentialFilter_RHS_Conserv, DifferentialFilter_RHS_Conserv_loc, &qf_rhs));
         break;
-      default:
-        SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Differential filtering not available for chosen state variable");
+      case STATEVAR_ENTROPY:
+        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, DifferentialFilter_RHS_Entropy, DifferentialFilter_RHS_Entropy_loc, &qf_rhs));
+        break;
     }
     if (diff_filter->do_mms_test) {
       PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_rhs));
diff --git a/examples/fluids/src/setupdm.c b/examples/fluids/src/setupdm.c
index a0df7dfe69..ca997adc01 100644
--- a/examples/fluids/src/setupdm.c
+++ b/examples/fluids/src/setupdm.c
@@ -100,6 +100,14 @@ PetscErrorCode SetUpDM(DM dm, ProblemData problem, PetscInt degree, PetscInt q_e
       PetscCall(PetscSectionSetComponentName(section, 0, 3, "VelocityZ"));
       PetscCall(PetscSectionSetComponentName(section, 0, 4, "Temperature"));
       break;
+
+    case STATEVAR_ENTROPY:
+      PetscCall(PetscSectionSetComponentName(section, 0, 0, "EntropyDensity"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 1, "EntropyMomentumX"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 2, "EntropyMomentumY"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 3, "EntropyMomentumZ"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 4, "EntropyTotalEnergy"));
+      break;
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index f74e2d98ca..6cd7370a37 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -373,8 +373,9 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
     case STATEVAR_CONSERVATIVE:
       PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollection_Conserv, ChildStatsCollection_Conserv_loc, &qf_stats_collect));
       break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "No statisics collection available for chosen state variable");
+    case STATEVAR_ENTROPY:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollection_Entropy, ChildStatsCollection_Entropy_loc, &qf_stats_collect));
+      break;
   }
 
   if (user->spanstats.do_mms_test) {
diff --git a/examples/fluids/src/velocity_gradient_projection.c b/examples/fluids/src/velocity_gradient_projection.c
index 7b1f970d72..931b69f57d 100644
--- a/examples/fluids/src/velocity_gradient_projection.c
+++ b/examples/fluids/src/velocity_gradient_projection.c
@@ -74,8 +74,10 @@ PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ce
       PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Conserv, VelocityGradientProjectionRHS_Conserv_loc,
                                                       &qf_rhs_assemble));
       break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "No velocity gradient projection QFunction for chosen state variable");
+    case STATEVAR_ENTROPY:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Entropy, VelocityGradientProjectionRHS_Entropy_loc,
+                                                      &qf_rhs_assemble));
+      break;
   }
 
   PetscCallCeed(ceed, CeedQFunctionSetContext(qf_rhs_assemble, problem->apply_vol_ifunction.qfunction_context));
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6347e95ca826677768a98731415bd0aa2e2db036
GIT binary patch
literal 2340
zcmXBVdpuNm8vt;rt!TD=3Ef6xZ8a`kHRw&=U%4ddf=yd?%Ozcq+=^TlgIo`Jbz|lX
zp`>Drln>ILV`rD5KCx1$C<!$#xs+bjdY@-@{+oH`obx=-?|Ht5Gt}j!^3UfR5+DA`
zPO255{HhtT^Pium>9M-=v!hlz#}kQ7fxvuG1CfZe3XC`@40ZU~5BmiwEceUs6ANdS
zi1YcY#X6^nWMlhabFhS=;uPwioFZASVs>A$dlI32SLU?{mlMgVDeHZ={TWIq)DO}>
z$8v}7CRk_n6PmFS-YGsq=)7ak>|1|jsHAp}mZ9@3cgRC7^BN>{lyl6i3vq-_onLle
zA!ewQ$&0IclOQh3%u@>`^nV@x)h!hgI$30B$D3fN)R--bUQ1SYURGwlsf0*UKl2wz
z?-6><+JQR9DuxQMwh4(!X1QN0v`SuR6S4f$uI|x0gbuUoNc&|2T>sAejKCN2uV+R>
zH3%)tDhs0L60wWbw5~r>87i;A+aqW_<gcl#rvHY}M|GmlKUfQO29F3$H!>8p?fUOS
zw;(^aFLp~>2~C-BCgwbZ{Pvc(?)nJ#FQSIVjzGNn)X*>sbrt_y@z)GOpHb8P$Q^|G
z>ZV%tLReib%K;USbwt8!b(uWmPUs6x_H+Nf&rtrAkt4Zs=zox9{qklaUcCK+`;ldY
zKIf#`v_Js&=a$uOA7{D!@qRu{HwkUqHe%JCLUQFEf|-UCLxq|Z3|8l|-1oM<Jd^1}
zvdDUGX<QwlJBkH94Y>@Jc-?EK+yL?mE)zUxA@s4(_3ym*5IQ5FvM8m9p^_V(b2GCc
zF7{;|&k;I1=Y!9_Xz0`YNp}5ys6XQ9!?<)-SMzdhf8YlqN$9h%o0mz%Q|c4eZXXyb
z(AbKTv={pCX54-)kcgkuuG=v>LFnLo<H8MXQ2+m&7q_x3w~w~0aFP+)P|Lab{7aId
zc|;;Szl))^&ipb>_8y*}cunr=O=y?m;b8YtBHpbhJ>Mu~sMz*xn@zk~?qHiykKbxS
zr&ZW=ZrDxe$oLy%)sGC7KFe01vJlRX@71zmADoNM<Mz@XLQBnavU3zr-=kQ=el1p)
z-(^#D%7t8N4jFwKX-{a!T0u-0%r`0N4*oI*{hM*tq&9_6rEXtmch4sDL9as#0__><
z(20h|@hEojPg{D9xk(A_Gj?`)`CB3iyK|p%%V5$Tl^QurK8EuN82vR%My_zox%w$k
zC$~CoqK;;$gn{0JJUx~>()i?q+d?8*scV_gl?dl(b8w-t6GJ(5NEEe+P+v^r(M8vY
z<l=AuKfI67e?)uKin<x9yvRxWQyQzgWLC<5ECz|>ZCq1`U>2b@DnI?(Gl!ve%RFD^
zeuDmoXzWlqNwUkUR&T#x3HwpyIi``pP$%|`M+CaE+y@UD^f_(NpY#jIi+#yekIU|&
zrEq<lUBZfPH<mlvQ|cfcA~_f9?psD$lAO?mzMAGg!~B?&^rmR&uUS?`L=2&gORpT*
zA42GA&eYKGOYq@ud2zD{@_%bIZ8glzt)w}kA}c~$FBr-6vu3Ca;jI^1jp$GK@^@17
zXUhuCO7th+jOUI17~edVhW<Qmc4|d`0{mU=z@HB79kyoZ&v%cu1t|TIsfD3GnHwv?
zr&Uv0kAyIPLZSBVm>Trw^ubrj=+7n{ekJ<T*{!3m^yjMi3z#nt`CSF<Pn2`K%j>1+
z&&|GY@Cp2h$o&c8vdk-+RM8*JmXZVLkHO(?Kczo^j5(k`7kP8y(4Ql%TfRkqZr3k1
zMSm=|@y*a54ZZI<;Lk*Xo~Q!-Y5n}x68+h*H5Teq)K`}DE6|_R={}y|52*|*<f1=e
zE1tYXe|Sr)hryru&aWG6(Vs?hr%9zhUblIWpWEk}brkrsw)0i=W%Q>oqmZZcNAuwm
zh*yUhQqcd@V(+>!9{R&8Z-@CysaL5|zlHuxOt)4=e-f@aHljZ#jkkbr{*^~$R3ZAa
zF<^QI_~Q^FinRfM980Zak?2oJ?N$o?Ia$V^f&M5)?mB}%UZo17&FD{Ir;juGqgeLh
zD)?g~t`~cvKS8kp?&yzuy15YY3znH`>4QHZ0~4<s&>yBNFB$#m(9u<exY+kiI@Fz=
z%Q^1y4*eOrZV`_D_>}2vLVs%RJ!in5-nOP-L-eQb_{eGW$9|2BgZ`{em}v|C$W=bh
zg#3dGrmyO+L4WpHnDNn{P2T(n^ym1wXL;by;kvK(f#^?1lb;DZKe6D_0PKg$^^Z4x
ze1!h=@ArrKjBVe!`Sc0&$GAxy_A;$PWZ(|(k;wQn^J@M=f3_~TAyE2r`|B(8hZ&zt
zM}ICGocxUbM4xD{M}MrBjPC+}LfLisqv(&Cbnq$oBURm#0{i!VM$B(IY2eR+(~As5
z=ui2vpgrhMk7}z5`qO2wT!{YY*rx6Sf2#5&y|w6%+3&se=+BtZq%Ha*Kc;~HajPAQ
zTj2egFwmb~6{_^7;F}HTkAuN@2m0f5;}>1@N0i=iN$Jn+f*SP4Z^u13`V;Qj+JOG-
z=7pxBKjLLI>flejy56DZ;7`@uER!Jg$BLZoM1R!w7zLm|^FItg{c9?v{kBo)&s2Ee
z7xbs;-L>`TPq64@68Lk)v!kQ{{YlJv3jH1Jk!@c0J^C}KQ8tPG9F8@0RQl5@->CFQ
z;Twzoym?_`0sef;`{)n*5B|K?mZ3j}zs)iQe-e8JH}TP*PS>X^(4X>wHrM|HBZ!)b

literal 0
HcmV?d00001


From eba67db85a1584b4b3808078695d4c1b4e99aab2 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 14 Jun 2024 18:47:38 -0600
Subject: [PATCH 075/571] fluids: Remove entropy variable from State struct

---
 examples/fluids/problems/newtonian.c         |   6 +-
 examples/fluids/qfunctions/blasius.h         |   2 -
 examples/fluids/qfunctions/newtonian_state.h | 192 ++++++++++---------
 3 files changed, 106 insertions(+), 94 deletions(-)

diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index e1aecc6ca6..4357644711 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -85,9 +85,9 @@ static PetscErrorCode TestState_fwd(StateVariable state_var_A, StateVariable sta
     {  // Calculate dB using State functions
       CeedScalar dA[5] = {0};
 
-      dA[i]    = A0[i];
-      State ds = StateFromQ_fwd(gas, state_0, dA, state_var_A);
-      StateToQ(gas, ds, dB, state_var_B);
+      dA[i]          = A0[i];
+      State dstate_0 = StateFromQ_fwd(gas, state_0, dA, state_var_A);
+      StateToQ_fwd(gas, state_0, dstate_0, dB, state_var_B);
     }
 
     {  // Calculate dB_fd via finite difference approximation
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index 384a67a903..d80fe4ce63 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -158,11 +158,9 @@ CEED_QFUNCTION(Blasius_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in
     if (context->weakT) {  // density from the current solution
       s.U.density = s_int.U.density;
       s.Y         = StatePrimitiveFromConservative(gas, s.U);
-      s.V         = StateEntropyFromConservative(gas, s.U);
     } else {  // Total energy from current solution
       s.U.E_total = s_int.U.E_total;
       s.Y         = StatePrimitiveFromConservative(gas, s.U);
-      s.V         = StateEntropyFromConservative(gas, s.U);
     }
 
     StateConservative Flux_inviscid[3];
diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h
index eb9f12b3d1..e41dd8d4fd 100644
--- a/examples/fluids/qfunctions/newtonian_state.h
+++ b/examples/fluids/qfunctions/newtonian_state.h
@@ -24,7 +24,6 @@ typedef struct {
 typedef struct {
   StateConservative U;
   StatePrimitive    Y;
-  StateEntropy      V;
 } State;
 
 CEED_QFUNCTION_HELPER void UnpackState_U(StateConservative s, CeedScalar U[5]) {
@@ -96,6 +95,63 @@ CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative_fwd(Newtonia
   return dY;
 }
 
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
+  StateEntropy     V;
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar rho       = Y.pressure / (GasConstant(gas) * Y.temperature);
+  const CeedScalar entropy   = log(Y.pressure) - gamma * log(rho);
+  const CeedScalar rho_div_p = rho / Y.pressure;
+  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
+
+  V.S_density = (gamma - entropy) / (gamma - 1) - rho_div_p * e_kinetic;
+  for (int i = 0; i < 3; i++) V.S_momentum[i] = rho_div_p * Y.velocity[i];
+  V.S_energy = -rho_div_p;
+  return V;
+}
+
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY) {
+  StateEntropy     dV;
+  const CeedScalar gamma = HeatCapacityRatio(gas);
+  CeedScalar       drho  = (dY.pressure * s.Y.temperature - s.Y.pressure * dY.temperature) / (GasConstant(gas) * s.Y.temperature * s.Y.temperature);
+
+  const CeedScalar e_kinetic  = .5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
+  const CeedScalar rho_div_p  = s.U.density / s.Y.pressure;
+  const CeedScalar drho_div_p = (drho * s.Y.pressure - s.U.density * dY.pressure) / Square(s.Y.pressure);
+
+  CeedScalar dentropy = dY.pressure / s.Y.pressure - gamma * drho / s.U.density;
+
+  dV.S_density = -dentropy / (gamma - 1) - de_kinetic * rho_div_p - e_kinetic * drho_div_p;
+  for (CeedInt i = 0; i < 3; i++) dV.S_momentum[i] = rho_div_p * dY.velocity[i] + drho_div_p * s.Y.velocity[i];
+  dV.S_energy = -drho_div_p;
+  return dV;
+}
+
+CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) {
+  StatePrimitive Y;
+  for (int i = 0; i < 3; i++) Y.velocity[i] = -V.S_momentum[i] / V.S_energy;
+  Y.temperature              = -1 / (GasConstant(gas) * V.S_energy);
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
+  const CeedScalar entropy   = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
+  const CeedScalar log_P     = -(entropy + gamma * log(-V.S_energy)) / (gamma - 1);
+  Y.pressure                 = exp(log_P);
+  return Y;
+}
+
+CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) {
+  StatePrimitive dY;
+  StateEntropy   V = StateEntropyFromPrimitive(gas, s.Y);
+  for (int i = 0; i < 3; i++) dY.velocity[i] = -(dV.S_momentum[i] - V.S_momentum[i] * dV.S_energy / V.S_energy) / V.S_energy;
+  dY.temperature              = dV.S_energy / (GasConstant(gas) * V.S_energy * V.S_energy);
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
+  const CeedScalar dentropy   = (1 - gamma) * (dV.S_density - e_kinetic * dV.S_energy - de_kinetic * V.S_energy);
+  dY.pressure                 = s.Y.pressure * (-dentropy - gamma * dV.S_energy / V.S_energy) / (gamma - 1);
+  return dY;
+}
+
 CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
   StateConservative U;
   U.density = Y.pressure / (GasConstant(gas) * Y.temperature);
@@ -175,97 +231,38 @@ CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy(NewtonianId
 CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) {
   StateConservative dU;
   CeedScalar        dvelocity[3];
-  for (int i = 0; i < 3; i++) dvelocity[i] = (-dV.S_momentum[i] - s.Y.velocity[i] * dV.S_energy) / s.V.S_energy;
+  StateEntropy      V = StateEntropyFromPrimitive(gas, s.Y);
+  for (int i = 0; i < 3; i++) dvelocity[i] = (-dV.S_momentum[i] - s.Y.velocity[i] * dV.S_energy) / V.S_energy;
   const CeedScalar gamma      = HeatCapacityRatio(gas);
   const CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
   const CeedScalar de_kinetic = Dot3(dvelocity, s.Y.velocity);
-  const CeedScalar entropy    = gamma - (gamma - 1) * (s.V.S_density - e_kinetic * s.V.S_energy);
-  const CeedScalar dentropy   = -(gamma - 1) * (dV.S_density - (de_kinetic * s.V.S_energy + e_kinetic * dV.S_energy));
-  const CeedScalar log_rho    = -(entropy + log(-s.V.S_energy)) / (gamma - 1);
+  const CeedScalar entropy    = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
+  const CeedScalar dentropy   = -(gamma - 1) * (dV.S_density - (de_kinetic * V.S_energy + e_kinetic * dV.S_energy));
+  const CeedScalar log_rho    = -(entropy + log(-V.S_energy)) / (gamma - 1);
   const CeedScalar rho        = exp(log_rho);
-  dU.density                  = -rho / (gamma - 1) * (dentropy + dV.S_energy / s.V.S_energy);
+  dU.density                  = -rho / (gamma - 1) * (dentropy + dV.S_energy / V.S_energy);
   for (int i = 0; i < 3; i++) dU.momentum[i] = dU.density * s.Y.velocity[i] + s.U.density * dvelocity[i];
 
-  const CeedScalar e_internal  = -gas->cv / (GasConstant(gas) * s.V.S_energy);
-  const CeedScalar de_internal = gas->cv * dV.S_energy / (GasConstant(gas) * s.V.S_energy * s.V.S_energy);
+  const CeedScalar e_internal  = -gas->cv / (GasConstant(gas) * V.S_energy);
+  const CeedScalar de_internal = gas->cv * dV.S_energy / (GasConstant(gas) * V.S_energy * V.S_energy);
   const CeedScalar e_total     = e_internal + e_kinetic;
   dU.E_total                   = dU.density * e_total + s.U.density * (de_internal + de_kinetic);
   return dU;
 }
 
-CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
-  StateEntropy     V;
-  const CeedScalar gamma     = HeatCapacityRatio(gas);
-  const CeedScalar rho       = Y.pressure / (GasConstant(gas) * Y.temperature);
-  const CeedScalar entropy   = log(Y.pressure) - gamma * log(rho);
-  const CeedScalar rho_div_p = rho / Y.pressure;
-  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
-
-  V.S_density = (gamma - entropy) / (gamma - 1) - rho_div_p * e_kinetic;
-  for (int i = 0; i < 3; i++) V.S_momentum[i] = rho_div_p * Y.velocity[i];
-  V.S_energy = -rho_div_p;
-  return V;
-}
-
-CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY) {
-  StateEntropy     dV;
-  const CeedScalar gamma = HeatCapacityRatio(gas);
-  CeedScalar       drho  = (dY.pressure * s.Y.temperature - s.Y.pressure * dY.temperature) / (GasConstant(gas) * s.Y.temperature * s.Y.temperature);
-
-  const CeedScalar e_kinetic  = .5 * Dot3(s.Y.velocity, s.Y.velocity);
-  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
-  const CeedScalar rho_div_p  = s.U.density / s.Y.pressure;
-  const CeedScalar drho_div_p = (drho * s.Y.pressure - s.U.density * dY.pressure) / Square(s.Y.pressure);
-
-  CeedScalar dentropy = dY.pressure / s.Y.pressure - gamma * drho / s.U.density;
-
-  dV.S_density = -dentropy / (gamma - 1) - de_kinetic * rho_div_p - e_kinetic * drho_div_p;
-  for (CeedInt i = 0; i < 3; i++) dV.S_momentum[i] = rho_div_p * dY.velocity[i] + drho_div_p * s.Y.velocity[i];
-  dV.S_energy = -drho_div_p;
-  return dV;
-}
-
-CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) {
-  StatePrimitive Y;
-  for (int i = 0; i < 3; i++) Y.velocity[i] = -V.S_momentum[i] / V.S_energy;
-  Y.temperature              = -1 / (GasConstant(gas) * V.S_energy);
-  const CeedScalar gamma     = HeatCapacityRatio(gas);
-  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
-  const CeedScalar entropy   = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
-  const CeedScalar log_P     = -(entropy + gamma * log(-V.S_energy)) / (gamma - 1);
-  Y.pressure                 = exp(log_P);
-  return Y;
-}
-
-CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) {
-  StatePrimitive dY;
-  for (int i = 0; i < 3; i++) dY.velocity[i] = -(dV.S_momentum[i] - s.V.S_momentum[i] * dV.S_energy / s.V.S_energy) / s.V.S_energy;
-  dY.temperature              = dV.S_energy / (GasConstant(gas) * s.V.S_energy * s.V.S_energy);
-  const CeedScalar gamma      = HeatCapacityRatio(gas);
-  const CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
-  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
-  const CeedScalar dentropy   = (1 - gamma) * (dV.S_density - e_kinetic * dV.S_energy - de_kinetic * s.V.S_energy);
-  dY.pressure                 = s.Y.pressure * (-dentropy - gamma * dV.S_energy / s.V.S_energy) / (gamma - 1);
-  return dY;
-}
-
 CEED_QFUNCTION_HELPER State StateFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
   StateConservative U = StateConservativeFromPrimitive(gas, Y);
-  StateEntropy      V = StateEntropyFromPrimitive(gas, Y);
   State             s;
   s.U = U;
   s.Y = Y;
-  s.V = V;
   return s;
 }
 
 CEED_QFUNCTION_HELPER State StateFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY) {
   StateConservative dU = StateConservativeFromPrimitive_fwd(gas, s, dY);
-  StateEntropy      dV = StateEntropyFromPrimitive_fwd(gas, s, dY);
   State             ds;
   ds.U = dU;
   ds.Y = dY;
-  ds.V = dV;
   return ds;
 }
 
@@ -293,7 +290,10 @@ CEED_QFUNCTION_HELPER void StateToU(NewtonianIdealGasContext gas, const State in
 
 CEED_QFUNCTION_HELPER void StateToY(NewtonianIdealGasContext gas, const State input, CeedScalar Y[5]) { UnpackState_Y(input.Y, Y); }
 
-CEED_QFUNCTION_HELPER void StateToV(NewtonianIdealGasContext gas, const State input, CeedScalar V[5]) { UnpackState_V(input.V, V); }
+CEED_QFUNCTION_HELPER void StateToV(NewtonianIdealGasContext gas, const State input, CeedScalar V[5]) {
+  StateEntropy state_V = StateEntropyFromPrimitive(gas, input.Y);
+  UnpackState_V(state_V, V);
+}
 
 CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State input, CeedScalar Q[5], StateVariable state_var) {
   switch (state_var) {
@@ -309,6 +309,22 @@ CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State in
   }
 }
 
+CEED_QFUNCTION_HELPER void StateToQ_fwd(NewtonianIdealGasContext gas, const State input, const State dinput, CeedScalar dQ[5],
+                                        StateVariable state_var) {
+  switch (state_var) {
+    case STATEVAR_CONSERVATIVE:
+    case STATEVAR_PRIMITIVE:
+      StateToQ(gas, dinput, dQ, state_var);
+      break;
+    case STATEVAR_ENTROPY: {
+      StateEntropy dstate_v;
+
+      dstate_v = StateEntropyFromPrimitive_fwd(gas, input, dinput.Y);
+      UnpackState_V(dstate_v, dQ);
+    } break;
+  }
+}
+
 CEED_QFUNCTION_HELPER State StateFromU(NewtonianIdealGasContext gas, const CeedScalar U[5]) {
   State s;
   s.U.density     = U[0];
@@ -317,7 +333,6 @@ CEED_QFUNCTION_HELPER State StateFromU(NewtonianIdealGasContext gas, const CeedS
   s.U.momentum[2] = U[3];
   s.U.E_total     = U[4];
   s.Y             = StatePrimitiveFromConservative(gas, s.U);
-  s.V             = StateEntropyFromConservative(gas, s.U);
   return s;
 }
 
@@ -329,7 +344,6 @@ CEED_QFUNCTION_HELPER State StateFromU_fwd(NewtonianIdealGasContext gas, State s
   ds.U.momentum[2] = dU[3];
   ds.U.E_total     = dU[4];
   ds.Y             = StatePrimitiveFromConservative_fwd(gas, s, ds.U);
-  ds.V             = StateEntropyFromConservative_fwd(gas, s, ds.U);
   return ds;
 }
 
@@ -341,7 +355,6 @@ CEED_QFUNCTION_HELPER State StateFromY(NewtonianIdealGasContext gas, const CeedS
   s.Y.velocity[2] = Y[3];
   s.Y.temperature = Y[4];
   s.U             = StateConservativeFromPrimitive(gas, s.Y);
-  s.V             = StateEntropyFromPrimitive(gas, s.Y);
   return s;
 }
 
@@ -353,31 +366,32 @@ CEED_QFUNCTION_HELPER State StateFromY_fwd(NewtonianIdealGasContext gas, State s
   ds.Y.velocity[2] = dY[3];
   ds.Y.temperature = dY[4];
   ds.U             = StateConservativeFromPrimitive_fwd(gas, s, ds.Y);
-  ds.V             = StateEntropyFromPrimitive_fwd(gas, s, ds.Y);
   return ds;
 }
 
 CEED_QFUNCTION_HELPER State StateFromV(NewtonianIdealGasContext gas, const CeedScalar V[5]) {
-  State s;
-  s.V.S_density     = V[0];
-  s.V.S_momentum[0] = V[1];
-  s.V.S_momentum[1] = V[2];
-  s.V.S_momentum[2] = V[3];
-  s.V.S_energy      = V[4];
-  s.U               = StateConservativeFromEntropy(gas, s.V);
-  s.Y               = StatePrimitiveFromEntropy(gas, s.V);
+  State        s;
+  StateEntropy state_V;
+  state_V.S_density     = V[0];
+  state_V.S_momentum[0] = V[1];
+  state_V.S_momentum[1] = V[2];
+  state_V.S_momentum[2] = V[3];
+  state_V.S_energy      = V[4];
+  s.U                   = StateConservativeFromEntropy(gas, state_V);
+  s.Y                   = StatePrimitiveFromEntropy(gas, state_V);
   return s;
 }
 
 CEED_QFUNCTION_HELPER State StateFromV_fwd(NewtonianIdealGasContext gas, State s, const CeedScalar dV[5]) {
-  State ds;
-  ds.V.S_density     = dV[0];
-  ds.V.S_momentum[0] = dV[1];
-  ds.V.S_momentum[1] = dV[2];
-  ds.V.S_momentum[2] = dV[3];
-  ds.V.S_energy      = dV[4];
-  ds.U               = StateConservativeFromEntropy_fwd(gas, s, ds.V);
-  ds.Y               = StatePrimitiveFromEntropy_fwd(gas, s, ds.V);
+  State        ds;
+  StateEntropy state_dV;
+  state_dV.S_density     = dV[0];
+  state_dV.S_momentum[0] = dV[1];
+  state_dV.S_momentum[1] = dV[2];
+  state_dV.S_momentum[2] = dV[3];
+  state_dV.S_energy      = dV[4];
+  ds.U                   = StateConservativeFromEntropy_fwd(gas, s, state_dV);
+  ds.Y                   = StatePrimitiveFromEntropy_fwd(gas, s, state_dV);
   return ds;
 }
 

From 831dbe9e1dadefb64122292c735c2cf2987406b2 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 11 Jun 2024 11:13:35 -0600
Subject: [PATCH 076/571] fluids: Add h_scale_factor to STG options

---
 examples/fluids/README.md                    | 10 +++
 examples/fluids/problems/stg_shur14.c        | 11 ++-
 examples/fluids/qfunctions/stg_shur14.h      | 91 +++++++++++---------
 examples/fluids/qfunctions/stg_shur14_type.h |  1 +
 4 files changed, 68 insertions(+), 45 deletions(-)

diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index f24e09a694..2fe34dcb7b 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -1072,6 +1072,16 @@ Using the STG Inflow for the blasius problem adds the following command-line opt
   - `false`
   -
 
+* - `-stg_dx`
+  - Set the element size in the x direction. Default is calculated for box meshes, assuming equispaced elements.
+  -
+  - `m`
+
+* - `-stg_h_scale_factor`
+  - Scale element size for cutoff frequency calculation
+  - $1/p$
+  -
+
 :::
 
 This problem can be run with the `blasius.yaml` file via:
diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c
index 5d9d4f1dc4..d4a6e12d71 100644
--- a/examples/fluids/problems/stg_shur14.c
+++ b/examples/fluids/problems/stg_shur14.c
@@ -221,7 +221,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   char                     stg_inflow_path[PETSC_MAX_PATH_LEN] = "./STGInflow.dat";
   char                     stg_rand_path[PETSC_MAX_PATH_LEN]   = "./STGRand.dat";
   PetscBool                mean_only = PETSC_FALSE, use_stgstrong = PETSC_FALSE, use_fluctuating_IC = PETSC_FALSE, given_stg_dx = PETSC_FALSE;
-  CeedScalar               u0 = 0.0, alpha = 1.01, stg_dx = 1.0e-3;
+  CeedScalar               u0 = 0.0, alpha = 1.01, stg_dx = -1, stg_h_scale_factor = 1 / user->app_ctx->degree;
   CeedQFunctionContext     stg_context;
   NewtonianIdealGasContext newtonian_ig_ctx;
 
@@ -235,7 +235,11 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   PetscCall(PetscOptionsBool("-stg_strong", "Enforce STG inflow strongly", NULL, use_stgstrong, &use_stgstrong, NULL));
   PetscCall(PetscOptionsBool("-stg_fluctuating_IC", "\"Extrude\" the fluctuations through the domain as an initial condition", NULL,
                              use_fluctuating_IC, &use_fluctuating_IC, NULL));
-  PetscCall(PetscOptionsReal("-stg_dx", "Element size in streamwise direction at inflow", NULL, stg_dx, &stg_dx, &given_stg_dx));
+  PetscCall(PetscOptionsReal("-stg_dx", "Element length in x direction at inflow", NULL, stg_dx, &stg_dx, &given_stg_dx));
+  PetscCall(PetscOptionsReal("-stg_h_scale_factor", "Scale element size for cutoff frequency calculation", NULL, stg_h_scale_factor,
+                             &stg_h_scale_factor, NULL));
+  PetscCall(PetscOptionsDeprecated("-stg_dyScale", NULL, "libCEED 0.12.0", "Use -stg_h_scale_factor to scale all the element dimensions"));
+  PetscCall(PetscOptionsDeprecated("-stg_dz", NULL, "libCEED 0.12.0", NULL));
   PetscOptionsEnd();
 
   PetscCall(PetscCalloc1(1, &global_stg_ctx));
@@ -247,6 +251,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   global_stg_ctx->use_fluctuating_IC = use_fluctuating_IC;
   global_stg_ctx->theta0             = theta0;
   global_stg_ctx->P0                 = P0;
+  global_stg_ctx->h_scale_factor     = stg_h_scale_factor;
 
   {  // Calculate dx assuming constant spacing
     PetscReal domain_min[3], domain_max[3], domain_size[3];
@@ -256,6 +261,8 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
     PetscInt nmax = 3, faces[3];
     PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &nmax, NULL));
     global_stg_ctx->dx = given_stg_dx ? stg_dx : domain_size[0] / faces[0];
+    PetscCheck((global_stg_ctx->dx > 0) && PetscIsNormalReal((PetscReal)global_stg_ctx->dx), comm, PETSC_ERR_LIB,
+               "STG dx must be positive normal number, got %g", global_stg_ctx->dx);
   }
 
   PetscCallCeed(ceed, CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx));
diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index d6c7464660..e4ab60802c 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -102,12 +102,12 @@ CEED_QFUNCTION_HELPER CeedScalar Calc_qn(const CeedScalar kappa, const CeedScala
 }
 
 // Calculate hmax, ke, keta, and kcut
-CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3],
+CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar hNodSep[3],
                                              const CeedScalar nu, CeedScalar *hmax, CeedScalar *ke, CeedScalar *keta, CeedScalar *kcut) {
-  *hmax = Max(Max(h[0], h[1]), h[2]);
+  *hmax = Max(Max(hNodSep[0], hNodSep[1]), hNodSep[2]);
   *ke   = wall_dist == 0 ? 1e16 : 2 * M_PI / Min(2 * wall_dist, 3 * lt);
   *keta = 2 * M_PI * pow(Cube(nu) / eps, -0.25);
-  *kcut = M_PI / Min(Max(Max(h[1], h[2]), 0.3 * (*hmax)) + 0.1 * wall_dist, *hmax);
+  *kcut = M_PI / Min(Max(Max(hNodSep[1], hNodSep[2]), 0.3 * (*hmax)) + 0.1 * wall_dist, *hmax);
 }
 
 /*
@@ -115,21 +115,21 @@ CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const C
  *
  * Calculates q_n at a given distance to the wall
  *
- * @param[in]  wall_dist Distance to the nearest wall
- * @param[in]  eps       Turbulent dissipation w/rt wall_dist
- * @param[in]  lt        Turbulent length scale w/rt wall_dist
- * @param[in]  h         Element lengths in coordinate directions
- * @param[in]  nu        Dynamic Viscosity;
- * @param[in]  stg_ctx   STGShur14Context for the problem
- * @param[out] qn        Spectrum coefficients, [nmodes]
+ * @param[in]  wall_dist  Distance to the nearest wall
+ * @param[in]  eps        Turbulent dissipation w/rt wall_dist
+ * @param[in]  lt         Turbulent length scale w/rt wall_dist
+ * @param[in]  h_node_sep Element lengths in coordinate directions
+ * @param[in]  nu         Dynamic Viscosity;
+ * @param[in]  stg_ctx    STGShur14Context for the problem
+ * @param[out] qn         Spectrum coefficients, [nmodes]
  */
-CEED_QFUNCTION_HELPER void CalcSpectrum(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3],
+CEED_QFUNCTION_HELPER void CalcSpectrum(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h_node_sep[3],
                                         const CeedScalar nu, CeedScalar qn[], const StgShur14Context stg_ctx) {
   const CeedInt     nmodes = stg_ctx->nmodes;
   const CeedScalar *kappa  = &stg_ctx->data[stg_ctx->offsets.kappa];
   CeedScalar        hmax, ke, keta, kcut, Ektot = 0.0;
 
-  SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut);
+  SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut);
 
   for (CeedInt n = 0; n < nmodes; n++) {
     const CeedScalar dkappa = n == 0 ? kappa[0] : kappa[n] - kappa[n - 1];
@@ -181,28 +181,29 @@ CEED_QFUNCTION_HELPER void StgShur14Calc(const CeedScalar X[3], const CeedScalar
 /******************************************************
  * @brief Calculate u(x,t) for STG inflow condition
  *
- * @param[in]  X         Location to evaluate u(X,t)
- * @param[in]  t         Time to evaluate u(X,t)
- * @param[in]  ubar      Mean velocity at X
- * @param[in]  cij       Cholesky decomposition at X
- * @param[in]  Ektot     Total spectrum energy at this location
- * @param[in]  h         Element size in 3 directions
- * @param[in]  wall_dist Distance to closest wall
- * @param[in]  eps       Turbulent dissipation
- * @param[in]  lt        Turbulent length scale
- * @param[out] u         Velocity at X and t
- * @param[in]  stg_ctx   STGShur14Context for the problem
+ * @param[in]  X          Location to evaluate u(X,t)
+ * @param[in]  t          Time to evaluate u(X,t)
+ * @param[in]  ubar       Mean velocity at X
+ * @param[in]  cij        Cholesky decomposition at X
+ * @param[in]  Ektot      Total spectrum energy at this location
+ * @param[in]  h_node_sep Element size in 3 directions
+ * @param[in]  wall_dist  Distance to closest wall
+ * @param[in]  eps        Turbulent dissipation
+ * @param[in]  lt         Turbulent length scale
+ * @param[out] u          Velocity at X and t
+ * @param[in]  stg_ctx    STGShur14Context for the problem
  */
 CEED_QFUNCTION_HELPER void StgShur14Calc_PrecompEktot(const CeedScalar X[3], const CeedScalar t, const CeedScalar ubar[3], const CeedScalar cij[6],
-                                                      const CeedScalar Ektot, const CeedScalar h[3], const CeedScalar wall_dist, const CeedScalar eps,
-                                                      const CeedScalar lt, const CeedScalar nu, CeedScalar u[3], const StgShur14Context stg_ctx) {
+                                                      const CeedScalar Ektot, const CeedScalar h_node_sep[3], const CeedScalar wall_dist,
+                                                      const CeedScalar eps, const CeedScalar lt, const CeedScalar nu, CeedScalar u[3],
+                                                      const StgShur14Context stg_ctx) {
   const CeedInt     nmodes = stg_ctx->nmodes;
   const CeedScalar *kappa  = &stg_ctx->data[stg_ctx->offsets.kappa];
   const CeedScalar *phi    = &stg_ctx->data[stg_ctx->offsets.phi];
   const CeedScalar *sigma  = &stg_ctx->data[stg_ctx->offsets.sigma];
   const CeedScalar *d      = &stg_ctx->data[stg_ctx->offsets.d];
   CeedScalar        hmax, ke, keta, kcut;
-  SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut);
+  SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut);
   CeedScalar xdotd, vp[3] = {0.};
   CeedScalar xhat[] = {0., X[1], X[2]};
 
@@ -254,12 +255,13 @@ CEED_QFUNCTION(StgShur14Preprocess)(void *ctx, CeedInt Q, const CeedScalar *cons
         {dXdx_q[1][0][i], dXdx_q[1][1][i], dXdx_q[1][2][i]},
     };
 
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(dXdx[0][j] * dXdx[0][j] + dXdx[1][j] * dXdx[1][j]);
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(dXdx[0][j] * dXdx[0][j] + dXdx[1][j] * dXdx[1][j]);
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(wall_dist, ubar, cij, &eps, &lt, stg_ctx);
-    SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut);
+    SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut);
 
     // Calculate total TKE per spectrum
     CeedScalar Ek_tot = 0;
@@ -293,13 +295,14 @@ CEED_QFUNCTION(ICsStg)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSc
     const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]};
     CeedScalar       dXdx[3][3];
     InvertMappingJacobian_3D(Q, i, J, dXdx, NULL);
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j]));
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j]));
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(x_i[1], ubar, cij, &eps, &lt, stg_ctx);
     if (stg_ctx->use_fluctuating_IC) {
-      CalcSpectrum(x_i[1], eps, lt, h, nu, qn, stg_ctx);
+      CalcSpectrum(x_i[1], eps, lt, h_node_sep, nu, qn, stg_ctx);
       StgShur14Calc(x_i, time, ubar, cij, qn, u, stg_ctx);
     } else {
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
@@ -361,13 +364,14 @@ CEED_QFUNCTION(StgShur14Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *i
     QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
     wdetJb *= is_implicit ? -1. : 1.;
 
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(X[1][i], ubar, cij, &eps, &lt, stg_ctx);
     if (!mean_only) {
-      CalcSpectrum(X[1][i], eps, lt, h, mu / rho, qn, stg_ctx);
+      CalcSpectrum(X[1][i], eps, lt, h_node_sep, mu / rho, qn, stg_ctx);
       StgShur14Calc(x, time, ubar, cij, qn, u, stg_ctx);
     } else {
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
@@ -494,17 +498,18 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar *
         {dXdx_q[1][0][i], dXdx_q[1][1][i], dXdx_q[1][2][i]},
     };
 
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(coords[1][i], ubar, cij, &eps, &lt, stg_ctx);
     if (!mean_only) {
       if (1) {
-        StgShur14Calc_PrecompEktot(x, time, ubar, cij, inv_Ektotal[i], h, x[1], eps, lt, nu, u, stg_ctx);
+        StgShur14Calc_PrecompEktot(x, time, ubar, cij, inv_Ektotal[i], h_node_sep, x[1], eps, lt, nu, u, stg_ctx);
       } else {  // Original way
         CeedScalar qn[STG_NMODES_MAX];
-        CalcSpectrum(coords[1][i], eps, lt, h, nu, qn, stg_ctx);
+        CalcSpectrum(coords[1][i], eps, lt, h_node_sep, nu, qn, stg_ctx);
         StgShur14Calc(x, time, ubar, cij, qn, u, stg_ctx);
       }
     } else {
diff --git a/examples/fluids/qfunctions/stg_shur14_type.h b/examples/fluids/qfunctions/stg_shur14_type.h
index a8ed21c292..5e369cd702 100644
--- a/examples/fluids/qfunctions/stg_shur14_type.h
+++ b/examples/fluids/qfunctions/stg_shur14_type.h
@@ -25,6 +25,7 @@ struct STGShur14Context_ {
   bool                             is_implicit;         // !< Whether using implicit time integration
   bool                             mean_only;           // !< Only apply the mean profile
   CeedScalar                       dx;                  // !< dx used for h calculation
+  CeedScalar                       h_scale_factor;      // !< Scales the element size
   bool                             prescribe_T;         // !< Prescribe temperature weakly
   bool                             use_fluctuating_IC;  // !< Only apply the mean profile
   struct NewtonianIdealGasContext_ newtonian_ctx;

From fc39c77ef991b17a0c558c74fe821c6309b1c03f Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 18 Jun 2024 11:54:58 -0600
Subject: [PATCH 077/571] fluids: Make commandline option arrays all upper case

---
 examples/fluids/include/sgs_model_torch.h |  2 +-
 examples/fluids/navierstokes.h            | 23 +++++++++++------------
 examples/fluids/problems/bc_freestream.c  |  2 +-
 examples/fluids/problems/newtonian.c      |  2 +-
 4 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/examples/fluids/include/sgs_model_torch.h b/examples/fluids/include/sgs_model_torch.h
index 16217bb51b..a268f1287c 100644
--- a/examples/fluids/include/sgs_model_torch.h
+++ b/examples/fluids/include/sgs_model_torch.h
@@ -17,7 +17,7 @@ typedef enum {
   TORCH_DEVICE_HIP,
   TORCH_DEVICE_XPU,
 } TorchDeviceType;
-static const char *const TorchDeviceTypes[] = {"cpu", "cuda", "hip", "xpu", "TorchDeviceType", "TORCH_DEVICE_", NULL};
+static const char *const TorchDeviceTypes[] = {"CPU", "CUDA", "HIP", "XPU", "TorchDeviceType", "TORCH_DEVICE_", NULL};
 
 PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum);
 PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc);
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index de1fd300f0..e06f35f5b0 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -33,23 +33,22 @@ typedef enum {
   EULER_TEST_4                 = 4,
   EULER_TEST_5                 = 5,
 } EulerTestType;
-static const char *const EulerTestTypes[] = {"isentropic_vortex", "test_1",      "test_2", "test_3", "test_4", "test_5",
-                                             "EulerTestType",     "EULER_TEST_", NULL};
+static const char *const EulerTestTypes[] = {"ISENTROPIC_VORTEX", "1", "2", "3", "4", "5", "EulerTestType", "EULER_TEST_", NULL};
 
 // Advection - Wind types
-static const char *const WindTypes[] = {"rotation", "translation", "WindType", "WIND_", NULL};
+static const char *const WindTypes[] = {"ROTATION", "TRANSLATION", "WindType", "WIND_", NULL};
 
 // Advection - Initial Condition Types
-static const char *const AdvectionICTypes[] = {"sphere", "cylinder", "cosine_hill", "skew", "AdvectionICType", "ADVECTIONIC_", NULL};
+static const char *const AdvectionICTypes[] = {"SPHERE", "CYLINDER", "COSINE_HILL", "SKEW", "AdvectionICType", "ADVECTIONIC_", NULL};
 
 // Advection - Bubble Continuity Types
-static const char *const BubbleContinuityTypes[] = {"smooth", "back_sharp", "thick", "cosine", "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL};
+static const char *const BubbleContinuityTypes[] = {"SMOOTH", "BACK_SHARP", "THICK", "COSINE", "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL};
 
 // Stabilization methods
-static const char *const StabilizationTypes[] = {"none", "SU", "SUPG", "StabilizationType", "STAB_", NULL};
+static const char *const StabilizationTypes[] = {"NONE", "SU", "SUPG", "StabilizationType", "STAB_", NULL};
 
 // Stabilization tau constants
-static const char *const StabilizationTauTypes[] = {"Ctau", "AdvDiff_Shakib", "AdvDiff_Shakib_P", "StabilizationTauType", "STAB_TAU_", NULL};
+static const char *const StabilizationTauTypes[] = {"CTAU", "ADVDIFF_SHAKIB", "ADVDIFF_SHAKIB_P", "StabilizationTauType", "STAB_TAU_", NULL};
 
 // Test mode type
 typedef enum {
@@ -58,14 +57,14 @@ typedef enum {
   TESTTYPE_TURB_SPANSTATS = 2,
   TESTTYPE_DIFF_FILTER    = 3,
 } TestType;
-static const char *const TestTypes[] = {"none", "solver", "turb_spanstats", "diff_filter", "TestType", "TESTTYPE_", NULL};
+static const char *const TestTypes[] = {"NONE", "SOLVER", "TURB_SPANSTATS", "DIFF_FILTER", "TestType", "TESTTYPE_", NULL};
 
 // Subgrid-Stress mode type
 typedef enum {
   SGS_MODEL_NONE        = 0,
   SGS_MODEL_DATA_DRIVEN = 1,
 } SGSModelType;
-static const char *const SGSModelTypes[] = {"none", "data_driven", "SGSModelType", "SGS_MODEL_", NULL};
+static const char *const SGSModelTypes[] = {"NONE", "DATA_DRIVEN", "SGSModelType", "SGS_MODEL_", NULL};
 
 // Subgrid-Stress mode type
 typedef enum {
@@ -73,7 +72,7 @@ typedef enum {
   SGS_MODEL_DD_SEQENTIAL_CEED  = 1,
   SGS_MODEL_DD_SEQENTIAL_TORCH = 2,
 } SGSModelDDImplementation;
-static const char *const SGSModelDDImplementations[] = {"fused", "sequential_ceed", "sequential_torch", "SGSModelDDImplementation", "SGS_MODEL_DD_",
+static const char *const SGSModelDDImplementations[] = {"FUSED", "SEQUENTIAL_CEED", "SEQUENTIAL_TORCH", "SGSModelDDImplementation", "SGS_MODEL_DD_",
                                                         NULL};
 
 // Mesh transformation type
@@ -81,10 +80,10 @@ typedef enum {
   MESH_TRANSFORM_NONE      = 0,
   MESH_TRANSFORM_PLATEMESH = 1,
 } MeshTransformType;
-static const char *const MeshTransformTypes[] = {"none", "platemesh", "MeshTransformType", "MESH_TRANSFORM_", NULL};
+static const char *const MeshTransformTypes[] = {"NONE", "PLATEMESH", "MeshTransformType", "MESH_TRANSFORM_", NULL};
 
 static const char *const DifferentialFilterDampingFunctions[] = {
-    "none", "van_driest", "mms", "DifferentialFilterDampingFunction", "DIFF_FILTER_DAMP_", NULL};
+    "NONE", "VAN_DRIEST", "MMS", "DifferentialFilterDampingFunction", "DIFF_FILTER_DAMP_", NULL};
 
 // -----------------------------------------------------------------------------
 // Structs
diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c
index e57dcf6272..ac5713e666 100644
--- a/examples/fluids/problems/bc_freestream.c
+++ b/examples/fluids/problems/bc_freestream.c
@@ -16,7 +16,7 @@
 #include "../navierstokes.h"
 #include "../qfunctions/newtonian_types.h"
 
-static const char *const RiemannSolverTypes[] = {"hll", "hllc", "RiemannSolverTypes", "RIEMANN_", NULL};
+static const char *const RiemannSolverTypes[] = {"HLL", "HLLC", "RiemannSolverTypes", "RIEMANN_", NULL};
 
 PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianIdealGasContext newtonian_ig_ctx, const StatePrimitive *reference) {
   User                 user = *(User *)ctx;
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 4357644711..5e9d6f579a 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -16,7 +16,7 @@
 #include "../navierstokes.h"
 
 // For use with PetscOptionsEnum
-static const char *const StateVariables[] = {"conservative", "primitive", "entropy", "StateVariable", "STATEVAR_", NULL};
+static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "ENTROPY", "StateVariable", "STATEVAR_", NULL};
 
 static PetscErrorCode CheckQWithTolerance(const CeedScalar Q_s[5], const CeedScalar Q_a[5], const CeedScalar Q_b[5], const char *name,
                                           PetscReal rtol_0, PetscReal rtol_u, PetscReal rtol_4) {

From aedeac777a6915d907296ca7d6b0ea49e684ee4b Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 20 Jun 2024 08:29:23 -0600
Subject: [PATCH 078/571] fluids: Add unit tests for Riemann solver functions

---
 examples/fluids/problems/bc_freestream.c | 342 +++++++++++++++++++++++
 1 file changed, 342 insertions(+)

diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c
index ac5713e666..2907a241da 100644
--- a/examples/fluids/problems/bc_freestream.c
+++ b/examples/fluids/problems/bc_freestream.c
@@ -18,6 +18,8 @@
 
 static const char *const RiemannSolverTypes[] = {"HLL", "HLLC", "RiemannSolverTypes", "RIEMANN_", NULL};
 
+static PetscErrorCode RiemannSolverUnitTests(NewtonianIdealGasContext gas, CeedScalar rtol);
+
 PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianIdealGasContext newtonian_ig_ctx, const StatePrimitive *reference) {
   User                 user = *(User *)ctx;
   MPI_Comm             comm = user->comm;
@@ -112,6 +114,13 @@ PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, Newtonia
   PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(freestream_context, CEED_MEM_HOST, FreeContextPetsc));
   problem->apply_freestream.qfunction_context = freestream_context;
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(freestream_context, &problem->apply_freestream_jacobian.qfunction_context));
+
+  {
+    PetscBool run_unit_tests = PETSC_FALSE;
+
+    PetscCall(PetscOptionsGetBool(NULL, NULL, "-riemann_solver_unit_tests", &run_unit_tests, NULL));
+    if (run_unit_tests) PetscCall(RiemannSolverUnitTests(newtonian_ig_ctx, 5e-7));
+  }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -209,3 +218,336 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(outflow_context, &problem->apply_outflow_jacobian.qfunction_context));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
+
+// @brief Calculate relative error, (A - B) / S
+// If S < threshold, then set S=1
+static inline CeedScalar RelativeError(CeedScalar S, CeedScalar A, CeedScalar B, CeedScalar threshold) {
+  return (A - B) / (fabs(S) > threshold ? S : 1);
+}
+
+// @brief Check errors of a State vector and print if above tolerance
+static PetscErrorCode CheckQWithTolerance(const CeedScalar Q_s[5], const CeedScalar Q_a[5], const CeedScalar Q_b[5], const char *name,
+                                          PetscReal rtol_0, PetscReal rtol_u, PetscReal rtol_4) {
+  CeedScalar relative_error[5];  // relative error
+  CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+
+  PetscFunctionBeginUser;
+  relative_error[0] = RelativeError(Q_s[0], Q_a[0], Q_b[0], divisor_threshold);
+  relative_error[4] = RelativeError(Q_s[4], Q_a[4], Q_b[4], divisor_threshold);
+
+  CeedScalar u_magnitude = sqrt(Square(Q_s[1]) + Square(Q_s[2]) + Square(Q_s[3]));
+  for (int i = 1; i < 4; i++) {
+    relative_error[i] = RelativeError(u_magnitude, Q_a[i], Q_b[i], divisor_threshold);
+  }
+
+  if (fabs(relative_error[0]) >= rtol_0) {
+    printf("%s[0] error %g (expected %.10e, got %.10e)\n", name, relative_error[0], Q_s[0], Q_a[0]);
+  }
+  for (int i = 1; i < 4; i++) {
+    if (fabs(relative_error[i]) >= rtol_u) {
+      printf("%s[%d] error %g (expected %.10e, got %.10e)\n", name, i, relative_error[i], Q_s[i], Q_a[i]);
+    }
+  }
+  if (fabs(relative_error[4]) >= rtol_4) {
+    printf("%s[4] error %g (expected %.10e, got %.10e)\n", name, relative_error[4], Q_s[4], Q_a[4]);
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify RiemannFlux_HLL_fwd function against finite-difference approximation
+static PetscErrorCode TestRiemannHLL_fwd(NewtonianIdealGasContext gas, CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T           = 200;
+  const CeedScalar rho         = 1.2;
+  const CeedScalar p           = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base      = 40;
+  const CeedScalar u[3]        = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0_left[5]  = {p, u[0], u[1], u[2], T};
+  const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T};
+  CeedScalar       normal[3]   = {1, 2, 3};
+
+  PetscFunctionBeginUser;
+  State left0  = StateFromY(gas, Y0_left);
+  State right0 = StateFromY(gas, Y0_right);
+  ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3);
+
+  for (int i = 0; i < 10; i++) {
+    CeedScalar dFlux[5] = {0.}, dFlux_fd[5] = {0.};
+    {  // Calculate dFlux using *_fwd function
+      CeedScalar dY_right[5] = {0};
+      CeedScalar dY_left[5]  = {0};
+
+      if (i < 5) {
+        dY_left[i] = Y0_left[i];
+      } else {
+        dY_right[i % 5] = Y0_right[i % 5];
+      }
+      State dleft0  = StateFromY_fwd(gas, left0, dY_left);
+      State dright0 = StateFromY_fwd(gas, right0, dY_right);
+
+      StateConservative dFlux_state = RiemannFlux_HLL_fwd(gas, left0, dleft0, right0, dright0, normal);
+      UnpackState_U(dFlux_state, dFlux);
+    }
+
+    {  // Calculate dFlux_fd via finite difference approximation
+      CeedScalar Y1_left[5]  = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]};
+      CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]};
+      CeedScalar Flux0[5], Flux1[5];
+
+      if (i < 5) {
+        Y1_left[i] *= 1 + eps;
+      } else {
+        Y1_right[i % 5] *= 1 + eps;
+      }
+      State left1  = StateFromY(gas, Y1_left);
+      State right1 = StateFromY(gas, Y1_right);
+
+      StateConservative Flux0_state = RiemannFlux_HLL(gas, left0, right0, normal);
+      StateConservative Flux1_state = RiemannFlux_HLL(gas, left1, right1, normal);
+      UnpackState_U(Flux0_state, Flux0);
+      UnpackState_U(Flux1_state, Flux1);
+      for (int j = 0; j < 5; j++) dFlux_fd[j] = (Flux1[j] - Flux0[j]) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "RiemannFlux_HLL i=%d: Flux", i);
+    PetscCall(CheckQWithTolerance(dFlux_fd, dFlux, dFlux_fd, buf, rtol_0, rtol_u, rtol_4));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify RiemannFlux_HLLC_fwd function against finite-difference approximation
+static PetscErrorCode TestRiemannHLLC_fwd(NewtonianIdealGasContext gas, CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T           = 200;
+  const CeedScalar rho         = 1.2;
+  const CeedScalar p           = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base      = 40;
+  const CeedScalar u[3]        = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0_left[5]  = {p, u[0], u[1], u[2], T};
+  const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T};
+  CeedScalar       normal[3]   = {1, 2, 3};
+
+  PetscFunctionBeginUser;
+  State left0  = StateFromY(gas, Y0_left);
+  State right0 = StateFromY(gas, Y0_right);
+  ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3);
+
+  for (int i = 0; i < 10; i++) {
+    CeedScalar dFlux[5] = {0.}, dFlux_fd[5] = {0.};
+    {  // Calculate dFlux using *_fwd function
+      CeedScalar dY_right[5] = {0};
+      CeedScalar dY_left[5]  = {0};
+
+      if (i < 5) {
+        dY_left[i] = Y0_left[i];
+      } else {
+        dY_right[i % 5] = Y0_right[i % 5];
+      }
+      State dleft0  = StateFromY_fwd(gas, left0, dY_left);
+      State dright0 = StateFromY_fwd(gas, right0, dY_right);
+
+      StateConservative dFlux_state = RiemannFlux_HLLC_fwd(gas, left0, dleft0, right0, dright0, normal);
+      UnpackState_U(dFlux_state, dFlux);
+    }
+
+    {  // Calculate dFlux_fd via finite difference approximation
+      CeedScalar Y1_left[5]  = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]};
+      CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]};
+      CeedScalar Flux0[5], Flux1[5];
+
+      if (i < 5) {
+        Y1_left[i] *= 1 + eps;
+      } else {
+        Y1_right[i % 5] *= 1 + eps;
+      }
+      State left1  = StateFromY(gas, Y1_left);
+      State right1 = StateFromY(gas, Y1_right);
+
+      StateConservative Flux0_state = RiemannFlux_HLLC(gas, left0, right0, normal);
+      StateConservative Flux1_state = RiemannFlux_HLLC(gas, left1, right1, normal);
+      UnpackState_U(Flux0_state, Flux0);
+      UnpackState_U(Flux1_state, Flux1);
+      for (int j = 0; j < 5; j++) dFlux_fd[j] = (Flux1[j] - Flux0[j]) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "RiemannFlux_HLLC i=%d: Flux", i);
+    PetscCall(CheckQWithTolerance(dFlux_fd, dFlux, dFlux_fd, buf, rtol_0, rtol_u, rtol_4));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify ComputeHLLSpeeds_Roe_fwd function against finite-difference approximation
+static PetscErrorCode TestComputeHLLSpeeds_Roe_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T           = 200;
+  const CeedScalar rho         = 1.2;
+  const CeedScalar p           = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base      = 40;
+  const CeedScalar u[3]        = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0_left[5]  = {p, u[0], u[1], u[2], T};
+  const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T};
+  CeedScalar       normal[3]   = {1, 2, 3};
+
+  PetscFunctionBeginUser;
+  State left0  = StateFromY(gas, Y0_left);
+  State right0 = StateFromY(gas, Y0_right);
+  ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3);
+  CeedScalar u_left0  = Dot3(left0.Y.velocity, normal);
+  CeedScalar u_right0 = Dot3(right0.Y.velocity, normal);
+
+  for (int i = 0; i < 10; i++) {
+    CeedScalar ds_left, ds_right, ds_left_fd, ds_right_fd;
+    {  // Calculate ds_{left,right} using *_fwd function
+      CeedScalar dY_right[5] = {0};
+      CeedScalar dY_left[5]  = {0};
+
+      if (i < 5) {
+        dY_left[i] = Y0_left[i];
+      } else {
+        dY_right[i % 5] = Y0_right[i % 5];
+      }
+      State      dleft0   = StateFromY_fwd(gas, left0, dY_left);
+      State      dright0  = StateFromY_fwd(gas, right0, dY_right);
+      CeedScalar du_left  = Dot3(dleft0.Y.velocity, normal);
+      CeedScalar du_right = Dot3(dright0.Y.velocity, normal);
+
+      CeedScalar s_left, s_right;  // Throw away
+      ComputeHLLSpeeds_Roe_fwd(gas, left0, dleft0, u_left0, du_left, right0, dright0, u_right0, du_right, &s_left, &ds_left, &s_right, &ds_right);
+    }
+
+    {  // Calculate ds_{left,right}_fd via finite difference approximation
+      CeedScalar Y1_left[5]  = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]};
+      CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]};
+
+      if (i < 5) {
+        Y1_left[i] *= 1 + eps;
+      } else {
+        Y1_right[i % 5] *= 1 + eps;
+      }
+      State      left1    = StateFromY(gas, Y1_left);
+      State      right1   = StateFromY(gas, Y1_right);
+      CeedScalar u_left1  = Dot3(left1.Y.velocity, normal);
+      CeedScalar u_right1 = Dot3(right1.Y.velocity, normal);
+
+      CeedScalar s_left0, s_right0, s_left1, s_right1;
+      ComputeHLLSpeeds_Roe(gas, left0, u_left0, right0, u_right0, &s_left0, &s_right0);
+      ComputeHLLSpeeds_Roe(gas, left1, u_left1, right1, u_right1, &s_left1, &s_right1);
+      ds_left_fd  = (s_left1 - s_left0) / eps;
+      ds_right_fd = (s_right1 - s_right0) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "ComputeHLLSpeeds_Roe i=%d:", i);
+    {
+      CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+      CeedScalar ds_left_err, ds_right_err;
+
+      ds_left_err  = RelativeError(ds_left_fd, ds_left, ds_left_fd, divisor_threshold);
+      ds_right_err = RelativeError(ds_right_fd, ds_right, ds_right_fd, divisor_threshold);
+      if (fabs(ds_left_err) >= rtol) printf("%s ds_left error %g (expected %.10e, got %.10e)\n", buf, ds_left_err, ds_left_fd, ds_left);
+      if (fabs(ds_right_err) >= rtol) printf("%s ds_right error %g (expected %.10e, got %.10e)\n", buf, ds_right_err, ds_right_fd, ds_right);
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify TotalSpecificEnthalpy_fwd function against finite-difference approximation
+static PetscErrorCode TestTotalSpecificEnthalpy_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T      = 200;
+  const CeedScalar rho    = 1.2;
+  const CeedScalar p      = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base = 40;
+  const CeedScalar u[3]   = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0[5]  = {p, u[0], u[1], u[2], T};
+
+  PetscFunctionBeginUser;
+  State state0 = StateFromY(gas, Y0);
+
+  for (int i = 0; i < 5; i++) {
+    CeedScalar dH, dH_fd;
+    {  // Calculate dH using *_fwd function
+      CeedScalar dY[5] = {0};
+
+      dY[i]         = Y0[i];
+      State dstate0 = StateFromY_fwd(gas, state0, dY);
+      dH            = TotalSpecificEnthalpy_fwd(gas, state0, dstate0);
+    }
+
+    {  // Calculate dH_fd via finite difference approximation
+      CeedScalar H0, H1;
+      CeedScalar Y1[5] = {Y0[0], Y0[1], Y0[2], Y0[3], Y0[4]};
+      Y1[i] *= 1 + eps;
+      State state1 = StateFromY(gas, Y1);
+
+      H0    = TotalSpecificEnthalpy(gas, state0);
+      H1    = TotalSpecificEnthalpy(gas, state1);
+      dH_fd = (H1 - H0) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "TotalSpecificEnthalpy i=%d:", i);
+    {
+      CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+      CeedScalar dH_err;
+
+      dH_err = RelativeError(dH_fd, dH, dH_fd, divisor_threshold);
+      if (fabs(dH_err) >= rtol) printf("%s dH error %g (expected %.10e, got %.10e)\n", buf, dH_err, dH_fd, dH);
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify RoeSetup_fwd function against finite-difference approximation
+static PetscErrorCode TestRowSetup_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar rho0[2] = {1.2, 1.4};
+
+  PetscFunctionBeginUser;
+  for (int i = 0; i < 2; i++) {
+    RoeWeights dR, dR_fd;
+    {  // Calculate using *_fwd function
+      CeedScalar drho[5] = {0};
+
+      drho[i] = rho0[i];
+      dR      = RoeSetup_fwd(rho0[0], rho0[1], drho[0], drho[1]);
+    }
+
+    {  // Calculate via finite difference approximation
+      RoeWeights R0, R1;
+      CeedScalar rho1[5] = {rho0[0], rho0[1]};
+      rho1[i] *= 1 + eps;
+
+      R0          = RoeSetup(rho0[0], rho0[1]);
+      R1          = RoeSetup(rho1[0], rho1[1]);
+      dR_fd.left  = (R1.left - R0.left) / eps;
+      dR_fd.right = (R1.right - R0.right) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "RoeSetup i=%d:", i);
+    {
+      CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+      RoeWeights dR_err;
+
+      dR_err.left  = RelativeError(dR_fd.left, dR.left, dR_fd.left, divisor_threshold);
+      dR_err.right = RelativeError(dR_fd.right, dR.right, dR_fd.right, divisor_threshold);
+      if (fabs(dR_err.left) >= rtol) printf("%s dR.left error %g (expected %.10e, got %.10e)\n", buf, dR_err.left, dR_fd.left, dR.left);
+      if (fabs(dR_err.right) >= rtol) printf("%s dR.right error %g (expected %.10e, got %.10e)\n", buf, dR_err.right, dR_fd.right, dR.right);
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Test Riemann solver related `*_fwd` functions via finite-difference approximation
+static PetscErrorCode RiemannSolverUnitTests(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  PetscFunctionBeginUser;
+  PetscCall(TestRiemannHLL_fwd(gas, rtol, rtol, rtol));
+  PetscCall(TestRiemannHLLC_fwd(gas, rtol, rtol, rtol));
+  PetscCall(TestComputeHLLSpeeds_Roe_fwd(gas, rtol));
+  PetscCall(TestTotalSpecificEnthalpy_fwd(gas, rtol));
+  PetscCall(TestRowSetup_fwd(gas, rtol));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}

From 54f81801806a437fa03eeab77ebdba810797a95b Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 20 Jun 2024 08:31:16 -0600
Subject: [PATCH 079/571] fix(fluids): Correct Riemann HLL *_fwd functions

---
 examples/fluids/navierstokes.c                |   3 ++-
 examples/fluids/qfunctions/riemann_solver.h   |  13 ++++++-------
 ...-navierstokes-gaussianwave-IDL-entropy.bin | Bin 2340 -> 2340 bytes
 .../fluids-navierstokes-gaussianwave-IDL.bin  | Bin 2340 -> 2340 bytes
 ...fluids-navierstokes-gaussianwave-shell.bin | Bin 7080 -> 7092 bytes
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 9324bce74f..713e30df4d 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -18,7 +18,8 @@
 //     ./navierstokes -ceed /cpu/self -options_file gaussianwave.yml
 //     ./navierstokes -ceed /gpu/cuda -problem advection -degree 1
 //
-//TESTARGS(name="Gaussian Wave, IDL and Entropy variables") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -state_var entropy -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70 -newtonian_unit_tests
+//TESTARGS(name="Newtonian and Riemann Solver Unit Tests",only="cpu") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e100 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 0 -newtonian_unit_tests -riemann_solver_unit_tests
+//TESTARGS(name="Gaussian Wave, IDL and Entropy variables") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -state_var entropy -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
 //TESTARGS(name="Blasius, SGS DataDriven Sequential Torch",only="torch") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_torch -sgs_model_dd_torch_model_path ./examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
 //TESTARGS(name="Blasius, SGS DataDriven Sequential Ceed") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_ceed
 //TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
diff --git a/examples/fluids/qfunctions/riemann_solver.h b/examples/fluids/qfunctions/riemann_solver.h
index 8ab0570504..1a793b356c 100644
--- a/examples/fluids/qfunctions/riemann_solver.h
+++ b/examples/fluids/qfunctions/riemann_solver.h
@@ -84,13 +84,11 @@ CEED_QFUNCTION_HELPER StateConservative Flux_HLL_fwd(State left, State right, St
   UnpackState_U(dflux_left, dF_l);
   UnpackState_U(dflux_right, dF_r);
   for (int i = 0; i < 5; i++) {
-    const CeedScalar U_diff      = U_r[i] - U_l[i];
-    const CeedScalar S_diff      = S_r - S_l;
-    const CeedScalar F_hll_denom = S_r * F_l[i] - S_l * F_r[i] + S_l * S_r * U_diff;
+    const CeedScalar S_diff = S_r - S_l;
 
-    dF_hll[i] += ((F_l[i] + S_r * U_diff) * S_diff - F_hll_denom) / Square(S_diff) * dS_r;
-    dF_hll[i] += ((-F_r[i] + S_r * U_diff) * S_diff + F_hll_denom) / Square(S_diff) * dS_l;
-    dF_hll[i] += (S_r * dF_l[i] - S_l * dF_r[i] + S_r * S_l * dU_r[i] - S_r * S_l * dU_l[i]) / S_diff;
+    dF_hll[i] += (S_l * (-F_l[i] + F_r[i] + S_l * U_l[i] - S_l * U_r[i]) / Square(S_diff)) * dS_r;
+    dF_hll[i] += (S_r * (F_l[i] - F_r[i] - S_r * U_l[i] + S_r * U_r[i]) / Square(S_diff)) * dS_l;
+    dF_hll[i] += (S_r * dF_l[i] - S_l * dF_r[i] + S_r * S_l * (dU_r[i] - dU_l[i])) / S_diff;
   }
   StateConservative dF = {
       dF_hll[0],
@@ -142,7 +140,8 @@ CEED_QFUNCTION_HELPER void ComputeHLLSpeeds_Roe_fwd(NewtonianIdealGasContext gas
   CeedScalar H_roe  = RoeAverage(r, H_left, H_right);
   CeedScalar dH_roe = RoeAverage_fwd(r, dr, H_left, H_right, dH_left, dH_right);
   CeedScalar a_roe  = sqrt((gamma - 1) * (H_roe - 0.5 * Square(u_roe)));
-  CeedScalar da_roe = 0.5 * (gamma - 1) / sqrt(H_roe) * dH_roe - 0.5 * sqrt(gamma - 1) * u_roe / sqrt(H_roe - 0.5 * Square(u_roe)) * du_roe;
+  CeedScalar da_roe = 0.5 * sqrt((gamma - 1) / (H_roe - 0.5 * Square(u_roe))) * dH_roe;  // (da/dH) dH
+  da_roe -= 0.5 * sqrt(gamma - 1) * u_roe / sqrt(H_roe - 0.5 * Square(u_roe)) * du_roe;  // (da/du) du
 
   *s_left   = u_roe - a_roe;
   *ds_left  = du_roe - da_roe;
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin
index 6347e95ca826677768a98731415bd0aa2e2db036..af70688040c15a2297f6289323b140f3568f1fa8 100644
GIT binary patch
literal 2340
zcmXBVdpuS7765SRNqTwToRb<8$*4pss$YeO#-m}H)J^iH@+!t75sqYss~O_#qr4Ik
zuH;p-cO9R+Y7!DEM)xE+$FrMKsafmH`S0xC-ut)K`mOcd3Zwn*GXMPMlDO~z%bUs(
z(!Yukw*Oq7wUXtkWk%`C7ZQPap3z3yi3s@0d8^-DU?|V$T1RBxvz#wuU8YB`5Pq)C
zdO9+e2)6Z%bOdTLRIFJ3qe2tQ5ifX?c<42u{q%D>suPG{-8ZuXMHUPd6Z2j0oE0o*
z>@Lsj@^V5myJPA%ClEU4_;YJHc>a8kYe!El%NcbQmV3bUsQu9k=6)h{l3GP`3!kAb
ze9Bli4Ev9kr{&1$5c)qhOJ8|?Pw2!fT}%A{hDwUwBNoZ9T&2rtxjXI<LDFX(jht<S
z-e4eUDj^INU}hc?dXVMJ+*U4a&m(-{N4tS~8$yR!_FnkV56=&8PVt|D_#2r?o`ldb
zmn#BYX~K8fJ!>$;hM{s=e2yC0L;Q+dx$k}=w72Tn^n!TEGk7BU+6{)Htc!*6^$?%S
z7fbFqp{b1uQzo_$-!G-kNxAU;>(uDTDOj)jb#&YW@+vu0eISa^DRL@Pb95oU2j8A*
z-C()O+a)rKP7(oQ<s^Q*fzas=d*us$U?~5Z$Vtx$sDIG*Mynda*Vvct9zRCt-yCGS
z`uD>7ITdVvF3TB;JK>x1E1@mACoEJ4$rYii;n>A&h6**#8)wy6&bY;(rty0sSZ?;q
z&0_*W_m&tPFRo>%gkpEw@G6KexWeeqR6-vQ-6Gj?ozSVgn!=`A43*gOf|LIi*7<&{
z9U*jP_In>CG1N)z6C3*!@{jO-6h6dq74z>6eF`7~-WyZD;Yh;&CjY^~@G(OL>g`_G
z{1EEzqSxcBMEG6z3@zg45jr?mFL|po<UhTi^Z5nKc|&in-t>~ty2|?>NhV3EqL-kv
z!-%1*=6(LM)C2ZUC=%{~zHllT3toPh@Eup)OdcF!sB=BmyGpjQoRRLeA|Ee8U%YMJ
zGrfS&k#RS6?a^YW<oOncV>NJoI_@1C+~HhQJ73QEozOQoXJ4Lx`X@d<rz>`6xjOyk
z1zHp2>XVR3?;jaL+ut*Ch=BefrLRJ-DMJ0`Tr#AZ303AY<LzuuXg7C{$otTr9;aF!
zuHD0G9M~iBt<EL1@4HJHR5{5Ct8aX!lE7SiT(;J7FcZ!vVDiihLsFoynWH~pKxj@~
zoJLmxL-8boZsAERXQH+1{n2G4OP{+vMt3iqqq&>1sUt%<^a{kyFkd6lt=@`FM36BS
zpsh7c=!Ubd_gNE$s=V%?x;TsFu9$yehl(2!yp3%u?(QP=vYP3oyp0UySbj`W0`;2-
zS+*}~CCRL;U0<;2Im}0`>%NZ%80yr||M$(_$#Nb(Y*|xz2I`ZXUVW^e6uRcShc&%o
zsBTMM?wNL$Gbt*wb~{P3GafWmIjWQF(0}=c1;;}FY$iPUTTox)%c=d)KYC>a`QD*~
zu3IFtaP496;j6IBbQQ$^cCF0UFhbutzc8>6?lV)LIKR%Ap;BXRJuljc{)B7iEJA<w
z=v0-XKe-0e{^*Zh<$(nBr}N3bozS0vfD?a!KfNk;^F-*+lE?GUN&P9Gy9)hD+b%bN
z{wTL{tI!`6r(=ESPrTb-_2|z|m1imFPv5Iu9#Ve_*Y!hxxeD(}X!Iw}Y1$M0`R|*I
zLsEYt@|0n{JgtC4qd$rr{6O?)jc3blsXq-9OVFPTO{JgEAFpoN2=u3_IcNm^F|}5w
z(Vt~2)q}vF4|$s24d~C)X#?0lY1qW-{uA`4rgZ8!`jaGE4bD1|npo#n^e0Tu73-_0
z`F%a?>#eq5<Olxw1-BX(O-cQ6Z*YM4Tt45~=mY*3^bKrNK!2{KW`#=qQEcpm^|~<K
zi(2T9Pop->af+N~<+ujq*C<oFH~{_mAp6x0{o(N~VLsCxPHXpp5B@b?6^^sepKSr0
zW#ErZNcMOk_+wvYZ3gv8;*_#{@NX#YbcK=#{Si;v`h!32Wo`FW(4T7qtOWfLuYR!#
z{4wW0D1U_h1jU`OMSqSatH(op!R4FOZh=1`l8+X@p+8Ljy{+g^uc~Gotn>X|Q>x(4
z6@?RHT=ZwO=u0yC<6EJef&ScSUf~1&40ew^nm~Wv#M<vefA$)zNQC;ktdD>8C-@_j
zk<<);KY6nk=cS`R2TlKKMt^qt@cxJXoXqGQ0e?Ij&uvLSe|lTPdtm>BysLxl;7{>X
z@va-_&yYhH7yYr_McJW0dTq0%{$#CLI)nbCsAj)Kf2`Cio=N?=J^K*)!@QT=Mt|~k
zPLHBLXHT_fqCaLD)pFoZC{vjhg#O4yr1gP6H)Rj93g}PtAD_>HKS$#8$2X!smF>-^
z&>xYkvp4$FukGKD{;2B9od$nuuSz;6(H~>R4E!Goc&BG*f&K_X>+hmJHf=s@(4V0c
zb7QGLc}u`8eXh;wh7$D0;ac-B`jeI1Ss?YNDt!g|bHch+1^o#>RPqr0an!tMi2m?b
zpMC=V#K}9QHiJL4OZ=nGqCdN>f?<AULgbuE)6pMwE7`*^A2p>rmA;`r-+YBXqd#rq
zFTv%+<luoB3HVdsStS2E`g4Af=oR`?PG(HdAIYU@0s7-HqI5*+&!dG6Qh&rv$Izd_
vg<0_Zty{EG{Rd5{Kd(zD^hft@Un2OEAZkAY-(NHJz8?ck(Vt4O%ly9q5h$7+

literal 2340
zcmXBVdpuNm8vt;rt!TD=3Ef6xZ8a`kHRw&=U%4ddf=yd?%Ozcq+=^TlgIo`Jbz|lX
zp`>Drln>ILV`rD5KCx1$C<!$#xs+bjdY@-@{+oH`obx=-?|Ht5Gt}j!^3UfR5+DA`
zPO255{HhtT^Pium>9M-=v!hlz#}kQ7fxvuG1CfZe3XC`@40ZU~5BmiwEceUs6ANdS
zi1YcY#X6^nWMlhabFhS=;uPwioFZASVs>A$dlI32SLU?{mlMgVDeHZ={TWIq)DO}>
z$8v}7CRk_n6PmFS-YGsq=)7ak>|1|jsHAp}mZ9@3cgRC7^BN>{lyl6i3vq-_onLle
zA!ewQ$&0IclOQh3%u@>`^nV@x)h!hgI$30B$D3fN)R--bUQ1SYURGwlsf0*UKl2wz
z?-6><+JQR9DuxQMwh4(!X1QN0v`SuR6S4f$uI|x0gbuUoNc&|2T>sAejKCN2uV+R>
zH3%)tDhs0L60wWbw5~r>87i;A+aqW_<gcl#rvHY}M|GmlKUfQO29F3$H!>8p?fUOS
zw;(^aFLp~>2~C-BCgwbZ{Pvc(?)nJ#FQSIVjzGNn)X*>sbrt_y@z)GOpHb8P$Q^|G
z>ZV%tLReib%K;USbwt8!b(uWmPUs6x_H+Nf&rtrAkt4Zs=zox9{qklaUcCK+`;ldY
zKIf#`v_Js&=a$uOA7{D!@qRu{HwkUqHe%JCLUQFEf|-UCLxq|Z3|8l|-1oM<Jd^1}
zvdDUGX<QwlJBkH94Y>@Jc-?EK+yL?mE)zUxA@s4(_3ym*5IQ5FvM8m9p^_V(b2GCc
zF7{;|&k;I1=Y!9_Xz0`YNp}5ys6XQ9!?<)-SMzdhf8YlqN$9h%o0mz%Q|c4eZXXyb
z(AbKTv={pCX54-)kcgkuuG=v>LFnLo<H8MXQ2+m&7q_x3w~w~0aFP+)P|Lab{7aId
zc|;;Szl))^&ipb>_8y*}cunr=O=y?m;b8YtBHpbhJ>Mu~sMz*xn@zk~?qHiykKbxS
zr&ZW=ZrDxe$oLy%)sGC7KFe01vJlRX@71zmADoNM<Mz@XLQBnavU3zr-=kQ=el1p)
z-(^#D%7t8N4jFwKX-{a!T0u-0%r`0N4*oI*{hM*tq&9_6rEXtmch4sDL9as#0__><
z(20h|@hEojPg{D9xk(A_Gj?`)`CB3iyK|p%%V5$Tl^QurK8EuN82vR%My_zox%w$k
zC$~CoqK;;$gn{0JJUx~>()i?q+d?8*scV_gl?dl(b8w-t6GJ(5NEEe+P+v^r(M8vY
z<l=AuKfI67e?)uKin<x9yvRxWQyQzgWLC<5ECz|>ZCq1`U>2b@DnI?(Gl!ve%RFD^
zeuDmoXzWlqNwUkUR&T#x3HwpyIi``pP$%|`M+CaE+y@UD^f_(NpY#jIi+#yekIU|&
zrEq<lUBZfPH<mlvQ|cfcA~_f9?psD$lAO?mzMAGg!~B?&^rmR&uUS?`L=2&gORpT*
zA42GA&eYKGOYq@ud2zD{@_%bIZ8glzt)w}kA}c~$FBr-6vu3Ca;jI^1jp$GK@^@17
zXUhuCO7th+jOUI17~edVhW<Qmc4|d`0{mU=z@HB79kyoZ&v%cu1t|TIsfD3GnHwv?
zr&Uv0kAyIPLZSBVm>Trw^ubrj=+7n{ekJ<T*{!3m^yjMi3z#nt`CSF<Pn2`K%j>1+
z&&|GY@Cp2h$o&c8vdk-+RM8*JmXZVLkHO(?Kczo^j5(k`7kP8y(4Ql%TfRkqZr3k1
zMSm=|@y*a54ZZI<;Lk*Xo~Q!-Y5n}x68+h*H5Teq)K`}DE6|_R={}y|52*|*<f1=e
zE1tYXe|Sr)hryru&aWG6(Vs?hr%9zhUblIWpWEk}brkrsw)0i=W%Q>oqmZZcNAuwm
zh*yUhQqcd@V(+>!9{R&8Z-@CysaL5|zlHuxOt)4=e-f@aHljZ#jkkbr{*^~$R3ZAa
zF<^QI_~Q^FinRfM980Zak?2oJ?N$o?Ia$V^f&M5)?mB}%UZo17&FD{Ir;juGqgeLh
zD)?g~t`~cvKS8kp?&yzuy15YY3znH`>4QHZ0~4<s&>yBNFB$#m(9u<exY+kiI@Fz=
z%Q^1y4*eOrZV`_D_>}2vLVs%RJ!in5-nOP-L-eQb_{eGW$9|2BgZ`{em}v|C$W=bh
zg#3dGrmyO+L4WpHnDNn{P2T(n^ym1wXL;by;kvK(f#^?1lb;DZKe6D_0PKg$^^Z4x
ze1!h=@ArrKjBVe!`Sc0&$GAxy_A;$PWZ(|(k;wQn^J@M=f3_~TAyE2r`|B(8hZ&zt
zM}ICGocxUbM4xD{M}MrBjPC+}LfLisqv(&Cbnq$oBURm#0{i!VM$B(IY2eR+(~As5
z=ui2vpgrhMk7}z5`qO2wT!{YY*rx6Sf2#5&y|w6%+3&se=+BtZq%Ha*Kc;~HajPAQ
zTj2egFwmb~6{_^7;F}HTkAuN@2m0f5;}>1@N0i=iN$Jn+f*SP4Z^u13`V;Qj+JOG-
z=7pxBKjLLI>flejy56DZ;7`@uER!Jg$BLZoM1R!w7zLm|^FItg{c9?v{kBo)&s2Ee
z7xbs;-L>`TPq64@68Lk)v!kQ{{YlJv3jH1Jk!@c0J^C}KQ8tPG9F8@0RQl5@->CFQ
z;Twzoym?_`0sef;`{)n*5B|K?mZ3j}zs)iQe-e8JH}TP*PS>X^(4X>wHrM|HBZ!)b

diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin
index fcb16553ca4f9c0c92fa4e5f69b77498db6c081d..a7133e5ad2fe98d923b67079a96fa4b7c3cb94e0 100644
GIT binary patch
literal 2340
zcmXBVdpuOz9suy|kr0ZbNMc-dJWky{#pEWe)jgv6Q0?(3J-jL=qY|Nza1%}uI+sc{
zCpS{36g@;LkEuMH8Ha>Cn{=AIX1wpkzU$Z7|Lwg$>$iTtwSMcnG={nlQhz=3(AlV7
z=R)1+@2@)1+yDBkz{2#^_)3+4-I$8I*`U;?im8YA_p+)3NVJ%B+sbMsre93_p(Wgb
z=BBjEqI?oCbtKCEaECt<Devx4U^`-tk-b1^haX}-abH;0R+Il)-MZy2NThNmg7e}(
zm>x52Zt&lODU<75pH4qa=_u<YS&;Y&8M}2R?J+$vP|4+-LyYW$(*~{>F--J{BL5{w
zL@i6W=pw@O&_u7-f?JqUGWEZm>4unDS=AEHEF@$y<|{8W#&qhn36D5WOx?1NPwOi|
z%(L(#28L=#wD9@;m=-CfbG54Ph+Pq5#L_xd*@~Di^^M)X?;zuontgdGn0|J8qu`&n
z5c8q=T#{%7nhO+NEmP}2!oKrbmwojx-7ocY6Ivif{E`y$enL!R{Z>o;9weO8C_cVb
z8Pmf|%amO=$@|(@!>v(>ncLrPbmK6I&x_$YhmiV?ekrcj!4%cTw@$Igl%kK}mkakv
zeKtD;!ycGUA9c~v@55A8v3Hy0-!P>+z4POT86;X{_S;vP7N+x+@1<_FKy&Ume9<cj
zrqpgM_gd#p#%sO_IL5;qFQcxdO;d=e%nLE^PsWN=pv<e{G!kjn_!W8AVtQs?yB7Bi
zraT76B>8oia&S$HNR}ay;y#x!-2_a_?`QYeRv@M~k(IBMNAi;_(m=nG_~QPzmuxXT
zXvKC=*2R>{l?LnN4~UV=mkTmSNc~46Y@5|c{u-UfhYc}x|7&USw@k#GKH1MR-$2%X
z0rzHN4yJEj<Gj_VM~sq9+#Vqr-+YKInu;dz2OTst9%GJY3p-WqDPo#Rwxv8!K_lb5
zre9WvB9VEHNktRMZ+u;OLrXAXX4Gu&>_3R+964JWC;lY;70~qK#$1x$<LZDbB);tI
z!Q`Mq#0*W?Y1c*};Y_a6ZCwFb|ExN7ei*51PU6$H98*j7Id4x%C-dh7{>d7{bako3
zudxOzVJ;qeOV(FibBp1YSR`8W#H~!;j_H=Pw7fOmXt1;>viV~OV#YEuXEWC#k-oLT
z*y};;d1H%?=iNLsXI^O>61@%mPcCA6tlNj)MJ7_|b{?4XVl2>~?}JsHJ_jgk)*$9k
zY?pngCRxAYWlKX8F~{91btyL-&0fJKPv*Tw%<%JDQwlqfXk~U(T-q5-^9vi-CNz=r
z8PwyDS&XR#1~KN__aI@TT9%=nJLzv%#|=tJSV3LpSz|9j%w0R1V16JHv7W?c`VL~c
z;vY}KcC*op^_!lb#$yrF))C)tv!CqG{yRLsi<nb+lO+}HL`?GPRSVJ_v3kjo2_N<}
z5@sv=#VX#w91VH4N7gs2@UMsh&Fe3)!l9!(Ynv{T{#k$Aq;n1CP{)IU?<OK9?gz#E
zhmSC2?C|Q+_-7=Vm*D+$bqS{X_>)zpLzr3?)~e^FikRQ1LywG9kZ?+Aab^9F;E!N*
z)Dir7HQ<y8{_vmGr-46tMF#@FpQevB_25r|H)|UF*=1b12mI-F?2>{%<#pB(;7`oV
zf78LA@v6@A;7`ap{dM3^ch8qh@Fy-^djR~=dn%~|fA;z-3BjMIbqOP+e_AiDWjTUB
zp~2rIq(7F$aa1|rk0i+KG5B+6=lD_Zr<N}}5B{i@uxh}cp;7Bo;19p(1Q+}XuYOPR
zU-<mq_~w(~PsN%E64!{OCSI}<{K@PoJqP}XlWwj6f6NU2m<E6Pct3Q5KfNz49)Uj=
zqK9Pv`=vYA`#Xa_^BcAAfj_>l=Z}Iv_l0N9gFiiM93POpuNM_2?gfA3vrIJj6OT*m
zz#j>3WhnUb>-#)?@Mq~;D=$)?&GwOkF!1M?M(-=|M_YZvJ@9AOr4z*GMP`NBq59xY
zF7xai_~RQU6N5k8s`wZ(UbBAJ*lFUA@#q2LG~!Rb)!6Jy@JBnf>lpZxw)W0D@JG;;
zD+Pb@BLw99Ik=|&`}}u*elG6{27fZn#u8t9&&u3FlSqD_<eBnwB)(WOyjlF+pXsd~
z;LoRbI1K#xd-~VU-~B-=+`u2dCk1lik5p%};3V-UGG|lSH29;vcAf$F6J`3zpZK%>
z(dCCG#GgRb?qcwFa9hLrVd777&Mo#n@Mo1}j3M#I=j}(6N#f7dsSB;X;Lq?jP5|*o
z{e*h<IPr&m95Y}9{z%&+2f!aemdhmRpTzeS#$Dh~koRmq__M8Z{&MiA;X*4H{IS*j
z><a$ad^)k2%%5}EBalt}DJwk9YbX93Hqe%C27mlKgOZ6qPx-0a$^F%m=5PFSAn_;9
z?0t?o_;V%u{X*i8Rp&RJDe=eoxfiR7_)}H=(ZZDYBW%sM*b4qAl&e1gf9|xK#e+X?
zq2bSnKWF%Y3_I{grGLZ{{PF2oRZ9HX==4CG4*t9>sdWW^oIAwc;E!aIT|~}jFgI6R
z3I2o?4txWD-q^gvq`zGRSx&a#&zqJ$UE+_uo$lTQ@Ml$*?jG<*R+FGk{1G&^<P*o+
zI<96EoCSYgx0U|_{unG~>w!O)rCdGY&rxdcD+>JaOxd&>{K;KBwI2N0WHP;<_;aMy
z?X@NN6H(;h3H}8427M*|EUwe<oe%z8PHB^aKitUp<KPdI5qSjsITdhh4*Ut1x|921
LN=fI8d-?wX7iL{{

literal 2340
zcmXBVc|29y9suxdDl;ifD&%u3FYmft(txv8QRk&`A6-M{Tjrr|JW44tL`5kdL+8}#
zxX3kAhC&g!MLNfjA(i>|hz#K>oc-3XyZ_mrJ*?mU{nq}i?^68SeS-SukU<f_z3jYY
z=e}R6ojdf;Z3!(*-wvyqzVrxFp@N3oZTgrhz}aT%8c3wBb$@ks1EwoyTGTSqk$85=
z<9r7XO!)?z`kvl}MA9}rl84!tv(B{UUGzD`^m|m)FAXNol|KwWpGBfY7X$b0SdHl^
zoz|8ZKBlzqc=){$V@ge0Emns-mx#Anb}b0g<0I8>c}<9!=-8`uYXmXOxqyce2}nd~
zT#9QL#q{TyUfb+;OiAhdnlEt$F$)vVmOe5^LMDE(^r;7!j^)i*>Mmd^%`}X2k&BqJ
z|N1(+`6H1^d2YbWH%zB0*W8b@L(DoY<@2Vxi22yi*w7tI#$RrAIx~;yvJm6{?U+YQ
zM{AV#(@7+D7jd;$Bp_kmRpkQlFPQ#o*y_mgM#MBdq1r<H5!2YPPnqwGgkr@a-=KO-
zk1=n)ZefwUc|#_yYY`)Mc)Q}uArjyFyk~_asn73Yk<TtnQJwtF=J}YCbXqg~6V$iu
zaIIhv(>J~xkd!UJ)UzVTFDm4{WnYh^F78Dl)s6SMZt^glA)UkJG$Zj*956;TV@mGs
zTK|-0GG3|9%OMMMY}dWd(%Olb>hzPs7EdfmxsSgHK8{36FI-}6-(h-Q=B@VeR80Lc
zI<1;>6;rH(ad)_EB$Bj0aK>gOrYCcgy3*AU(;KO^nT<$(1FEYGV@Uj_zn+Kf!StxX
zR&-1eQ;SkwS#9h^%;e<S!X6f>-!IT)ryj{)@%>5t08Hf$m5o$uA|~X_M?Gc>nSTY3
zTYCF3E#Pqqv(peGwJo$QJqR(au7>@Fhe`ZVmeP6)%&~f7#GVR6OmnHlCzGGi__TL>
zV6_m5HgUDbE=pp0`sa$`&5sc?FSkQjKaRxa95#Pmp7fVnvnFmN`8}+0v)3p66XBY=
z^#)=-&ls0}`~wN+Q}<??Jtgy>Sg%x2PwEm6s3T8IEw^X;ej6p<=eiGVa>R7aqXFrX
z<5&uNApe(In37jw4+}OSk$$n|+14gZzln=WK4*nS9~JsdJlTPmsrXy0{$wOtX}Ege
z_BU*G_djYf_g^6KrfSVhX*Qah6q){7;f`JhMTpyuY`~n#DR(D}87#YZ&@E{lso(WN
z_gGvfnZF_p3wbxpIcgBQeb5~(q+rdMGHb+)mH+OLFoi@vBn4k-Ny0QguW`ky>tua;
z^_(e?#FWD7^J$_~By5yRl+m^&{e93}(36cN<R>^ri;@tNWwJAPkrfhY6<;Xnio<l(
z_F~mmZ8UFK*PmsWi<r)?^9j)%<o`Hi){Q*FoNB>(^Ys*BqW}HVt~4H&FZKOqW+_0z
zBxx_N3tpI`IGHrGlZz$712dPXx?%}ezx^gXM@jz}oq7?HfjQJ~UV1t!5fiE=X8NNr
zrOB%LQ-G03=8|KT+*wTb@&A6inZlHYe}|3#KE&LkPF%OhM#5Ps^<s%E@Taz8qy_wW
z8o28n_`_H5PXT|@6ULOlpXRPbcHmEDP*ybfV{L332L5#K!G+*YCFQ9C{+#bzc?SHM
zk=NS;{+u$kC+lZQr@J$4EBF(d7Jm`^Sy5^r1OC`Kdo2ckN?!f=h4fGRbp@6H{5gG^
zUr+i&Ba~xd2mTCrN=1M_uJ-OR;Ll5L?g8*ewj^&H{P{e8qZj<)XB1rqe*zV}UxPn6
z6Xho0Pu2EVbMQy5wJ-|&xhd2<0sb^ybFKt`HawX8?oVI%-)F&}-W2IO;E!&ar8oF<
zWS6HA__O$WSsnP}T<XFDe{x$dtN?#{^oCPN-ggQIEtSBZ$*It4@Fxr_{tNsW@Oix(
z{7HD8(hvTus^3eK`nDY!|E2@}_|01Lz#r95rv>1Tb<~;#@Fy=~4-5QB9qg+Hf1IV4
znu9+cZ5nfAyix-vB%k=BIc3XoBK~9;)Z~8vf0nFFy9xfpxh$6fe`-t2GQppWI$d&R
z9gHpY$_IbUN;Q(fpZF`4iNv3X@!B5ppCtPy`6!XZZyFeTIq}_}X*(J4r+=<E4*dBs
z)YJI8Kc)*U;E#*{o+RSWu*$)r3gS-?_n{;4Peoyg>d~9vPjHY=7V*cZWVhcF;*Yzm
z7C)85AGP?n#hduknsQaI2>j8rSZYK3ae5UKc8K_Mdv;ZOAow$OHQ$5yBY*m&Q8@93
ze&Eb40)K{wj0(V?+Q={Eq<<ni^rHvCA1{rFTJXo>jX&A{#uHy9w0{79cJMkMf<N2l
zw8{FWxSrMnyNN%K^Rkm=i9epYUx$ga@=7kjhHr^KCH#=bRp5`yhZue0Pr6c6ycGD8
zvMDl}_+#+Cer7lEhg~j=av}aatF!!*NBj|X_nVy}{zOv8>I%T0KXP0Qz#q#wmE**p
zivst+7vRrgDzp>)ahmJgK>RT_JF7<O94oJAH~a1nd(ck^{tSFdSVGpPk7vcPaPTL9
zRYvyDM!C9;V`ikk57h9Mb%Q@u+@u2Hk12}^uK<7a!hgR2{!H`*TqXY0Hu?N!5B?;}
zJ6eN3&+Ycsf<GFEj=6z9cR7*W#2-INd)?*WkCWy8%ivFHqU|jBgMaY2PyF$1UNfl-
z{sh|gtARg0)y9^@AN2;CiF@Er3io;q_~UUcK?3|?c+<Av&sjNRckm}*x~~oVQS-Qw
F^MBy-Sfl^|

diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin
index ab4052b1dcbd131ed548737ae02d1bd99a3aa5da..67097e2ca604328ebb7a3bba06f6bfc54a30f668 100644
GIT binary patch
literal 7092
zcmXY#c_39?7sjuU43RXDL?{_c1C^X}%#!G;49S#`sVFJxrJ_PJDWQnw3=s|FCPgYC
z^N=~E$W$TXe(S9B^<Ul9^E~%=_g;IiEz;k%o5lR$C&Pm~Ofu#i!N1NvV!`|~?|zCs
zpr>JYsh?sA_KIFI&?dQ)+p|0RGdPLdc7x|t$H)q<<Mfb(Qxv;9B_O9amtsxRD3dM|
zC%IP)ggX<<i0r-;)s*E>H|5Liy1I~J57tB)<#9-E>z>#zxfh^5%T`F@29d5WJwABR
zkE~?b-<u*&QdToQFSyh@i{y?)J?>_wQ7oCdk7g<_IR%3+WOJL2k(GQpg>gzFn9m@(
zkjZD%j``Z#ZHdIm6xF!O|L4<jWAaT{Gx=`q!+c%y4sf!6o-3YtqL9faRfhRY7UVPe
zURYv2VLoS0I`zXQD7T%-7a4&0qEn@qd};DbzG6$FV5mt(-(wb&?@A2C?(g2F=V(B3
zr!*1+x1~dUO6ZHuFi!D}mC>2;e!$P|$g@~=@ZHrTZ{!Sh`Z*n!?ouosm2~sqHlpOp
ziBCl#mEbG??9=Rb6np5K*@$fld~UN<o?nt;@$OJe>b^?6R#i4rd?^Wa$D8NB3IHy9
zekcemhPuer<!g=rmtbx3wiM^px82p^%eR2<4c=ERPnmo_`I&rqzz^~jq}?ZqZ|u~#
zHqp-HoA`<O7CWUe`HX2@$mdeSDY>$}KUyOK^Ho;VV7@=sD=^>U>>(x}?`F<xRewWg
zwR=py6Azhu`Dd7XfuWd>_+mo5`X(M~WWnSs+89e&QSaZ_sg{!yaTRh~e!%zms^|Cl
z-6~F(>DkFkk`!4f_st>p&Q)sb?9kQ<;t)ACHBV)mY6CewFTl=gZX59>G$v&7r!!e`
zaQxy~^FNdod#COhW0D;2h(5p2#17WYsRbG%Eu7ZIWnU!rz`9bRf>*geq_)Z&E3@Pa
zC8z3E$xqhSQ_~1#yAP5)M3;ntVUZE67d>~8(T}SzpG(bQ%*VE|$9%2Zo)KN98@_IJ
zOvZdy$#l$D`PUZn?fi8H^SS5E<$MX{?KLh=#(d}Wr7_>7{;y0v>)lK~{urXQ;Z)p;
zcqZQ(aSHQo@#LlRDe(UbVe(a$aJnRBw*L@k^8E=GqO8{SB*eVmNwG|=v&Bn2NbbY@
zOBqF~L}P)~j|E#nZ_eYx{jCiYyZW5#r-Ee^OF1TQU+pJ~B_#jLNm`!sdG&sen?WqH
zqHlAe=L{3d%4AOc;ee$S%XVF+;fe1QOIGy9zJ%ArkMY5ktQgRf3<>fpi=^1)x{Ehw
zL{O~d+WCH0*OS~KbKW!d_c(p;A{=X%LcShzH-n&MnD2JxYs@EcaRT$@?b^<1d@9}Z
zS%ro9hQAqMzS{gT%;#kwhWXr-Gl<XYRBs2H9m9Oy?Y@{VBH#t)Gj2)8eBPqxIX_0v
zCteK%J;)budlK_m7|LNj&wpz$-|>D2qOVLZ*y1je&(Po;#U6fXcwYeY=KW`9mRkhA
z+u#4#p9t&NRWi?U9mO7-sVNtcL$Owk<axclK(VBCeLLIVkd-`Q?hTw!*bl$rPG4#u
zxg#a%lEcyzi~rBv%TMWjkb2?Hd3X5#CKu0cRH0akZwwy|<dIzer|~08Uy_x)d`bUW
zD<~^LH@B`2+VDN*%lOV(Pfiu+>^<whi>wrqG`agd1NKqN?2d#8k~_+K$|$~&<kmN;
zQ%d2`&l+CLPWw)=|83dUZr4Gv;-apZe%ed1%(-sDV@71f*g(+qRi>1crXsb+Z4bp0
z?qglc^O@oiI&=D+tvOjK9JwaeyOLs0z5baU4t+oW`{`_tc#5TNUVGzA67;uh^IX?z
z<dZVT0r_m?Q)2jdoIpO$#s!hjlJQ3jp9LDor$f*H`7~ZUihRQKG?9<&Ze8RPyHcLv
zb6o)WMESz`1AHD=RU@B<;<LzS@}DsB`C=1;eB6VlBcHn}k;vzs+F9h&bLcDbF;Evp
zKC7-ZBOi(7rpV{>R#@-A$7pFZ@>$%pl;(4o+hmS>T$gV`KGp3jk<a5C7332$CW(A9
z=ISG#M=|2a=iz)ehEKz6=&!w6&XV~gx4*>iLi=`-JKX-}ew7zdIGiD)wyYic>)VNg
zE98NrTLVjICdHcN?`{)nL$UZIUOF$|$|?0vd8yNu3_RQe*SPwj-mV?Mf!1qdqh4Cn
zGfv^Kzwp;wIn)c3PoVYcC*6Y?y>NZhbJWWvO8wn?zpjUU0D7zIdnk5a>Ban3b>Onh
z(mv=r$?Yu8G!=6siUe+%mus^?&(63*ypv*&8gE-Rl1Z`Tf7Le~`3ZeVq<S)HJ*Ox|
z|K!&d?eKeA&RbK^je2>O=TL8_X*lW`XZvu91ggy97SBSxcFlCuQ|0HNUgP9j)SHS9
zC5lomWF0hve4wXgUIM?LivF3lXK+q5+AbTuOLE7Dd{!3ba+<uS`c?9rL2qBVQR*nH
zqn$}d?!fu<*T-M2*Z|J$u&{4_Z;ASQo7QN5gnTR?%?8WOsCRBY>|<Kb+XwYl304wK
z-oq<iy`j&YedWgIJW(&W_7>`C@ULd{QiD15_g30|s5pjt^}iOu`pKJT9;8L_ggi)k
za}R#!FrAxk)ij9TZNzT5d4ps{>#Mm3Pxe6_QGZ-Ak>W|O6Pq)zi(;vbaeAusiDCDy
zfjqk3G}_%{vv;GOyP*~86`gZMy_aUKoZoFZ@v;e!5A?zn{-WM>kB_J)F$(tu(EEAw
zE@#+7|B=XT-~xJ%Hx5!(XZQPcx4)+NBHq4^t6f7*DaW{GRHbo-UOZlOvkvYlGrCu|
z#LH4v*Q4!Srj#gtLWp{rxshUJKdP!*@sBeyxTrIFEBqe(4^~;FE2G{(_<ht%5{N;)
z=R-S*p@NIKoe_hm*Y0=^_1qf9Q19g#KGdtc`kfdV5|}(zOQGJjv<S*d*=cv&en*mP
zsMmJrVlK(8*u+&zzC;YP%$Xi12fh;8_s%-hQ&y{7?DtHAKBRPNYj1fE#hNYb$e$rh
zvK$r^+`3vwR?Zv!VmJ60^`dyPP|wRb67}TFK5_<HBDM61W2hG)auoG;xS69~lUF$E
z%{ZL*lI38zs&^Fffu6M^^sk|-@%Ni6NN%~bKxJ7K?4xE&1A`qz;l$R7+OzGj4pJWM
z8&{#Mgmyj&G=e_=`>{x|p(DkzoBK##%#Ks8*u=l06!J~G92OwA67?Q(OHr?)uNL(T
zm)dg*Cq$kdC}Q;d_I^XXgB|lxuf&8xJ!)4LQNBdOyDat?qn9*?V*kjKn#sSG<n~mR
zpASo=Sgh1_<@}$C+`o~N8-0(F6%*%ueMF@wE8Ce8=C*J@84I#hI!%xhMzfbbb{6F1
zG<S#h8nEErv)RBTc?F~QHWT$OTiK%C`vfj0cT`ApmktZ{e671t&uV5j>L~`UK)vTL
zEs30t83jCW@22%k)yttj=3m*n_y;)^=dC+449*XQ-YtKg+HkUbbl3azQDmjS8`k^Z
z7bvSc(hqqW=8@c6wQEP;Z=!gOk6m4|cQ>c7tY!V57|191aL%ha(Tv{bPSg|DeSmsn
zgT_RbPlEER&;QqRTFB^yz`Yjq{%*HJJq6jlMB&Go2Y4qLy&@@L=##Pk;wKDYf6rGi
z@%NxuGvssCKWh@T>Un&@`vc(j%{fuaqY8aP`k<u1NvP9(-KzoT^tflNzx*9e9s9%@
zvGy;}H+GvZZa9p3*YtQ9y(Lkor}n9hQ>$Jz+bduO>eYCfp`KI_tYcblmlNvkk7*_9
z*gmS$T})9=J}VXO-9I;9<*$U#xnZ1hdr58~_p<gp9ZspzrmEGBa-b(F{5wtu>XJ*u
zt4@HP_Fbv4<>2eo5)ty7`1s$l?V^*(P<KG2x_ck$B?fw<-isq}?*Tm{o)MySN%T&=
z=0emPkuX6$i3VrXTkuo@_0G;q;(QEA;!0g-^lBF7QC7T<GP>8FhjYenjiP!2#nNA*
zE*}`psabesb;(R;*yo<lyQSd1_1%Mii!%%Aq`ggSZ<E}*znc}4J`uGwYo~kIUxsz{
z>JBG+BkFa%S7r2WE2G}r$RVOe=I(pfU%(Iaz9hr%M(eTGqMq?l=;NR_+clk2TOGbP
zPoe?!Uap7zJD``J@xTermtNTg%FXnAn0ZOK4fr~4F?tT?cXtXa+ieDXPmKb;W!Cgu
zs4sUqot_8F4v1#~hw2I5CR2JI4E)+f^Ubo+(s~D+?8EbZ+0b)fru4{M*auO~r2Zdz
z4$Siyh40O$)0!F%^B?%9TeP9S)XGKZA6-z-_^Ze={z=duY5y)bzrbIn;y?5cSPgR$
z`18LrLI0j^cg9~q3;pw5t<isBAM97~*KI$A{$5RRf1~|%XTW{otHO0DD|)^=ryBAG
z>Qd4+e}4-8ZeR7p$3Wj@g1r&;-IRW@$Q-zb@~A{71e^qY`Tx4zhM-@4^-%sf3IA?m
zV>^7Ap65h9DD}YS$CE!b!dy7u`9w@&JA59g;^hT8lffP*92`LZhR5&JK*qlo`V;u8
zEsjNh3$as-{|q=6!QZkG`abvzCpR(v)p?BnTDb3k|FWrA#^1XX{ogC~G5&Rr(SMd<
z5#wJ`kN%-Ka*Y3HUYO6yH(yS&rsp$>CF>GkJ~OcKdX^4zS&uZ&cqGhaN20w(romh`
zcTQPI9n58Z;k`Z6VICV@e#l4=<}s&{kSk8~JVxl6HpB0Da^L)D*e6PQ<Po?(Z<RT!
zQIb&)bJ$AW9MOF+hp8MTM@nEmbjgqU7>E8TE$Zm6)(H0^@OP22Mt}bJ<LK{G8jb#q
zO6$;nD7y;%54dQff8@(%^e4L1(SM;<Ao?G5%|`#y(>u}s^^?QsuM-IO4%&aiJeaS#
zACzik!F;v-g}C1|dcJaqnVkl6RrbY4cNAf+x_EFyRt?Nmm&Fh4x&`-lg&=<c9hj#q
zh4Mdsgn3F#mH#W8XSQq4A9)4)QdW$2>n^xoOCIGsIQSaoDE+5qif%ARt(7R2eg$*X
zrZtUWaR2DBT)aHM75!VC4x<0ulMB#4Yr$spfAUWm{fU4|^xqRjpnnkWa>ifS8~r7Z
z!}$#Up_|6h-$cX}{WEvM{T=*`=fgV!_&;y^gZ||;1L&`@I3Cu|m3^7s<8a<Ccps5#
zPoKAYw)6gk^^;;>8wK;bkVD0<MQ~63{n&P&;1GSzu9i`VqR-b?@9w{Vejby5r;8u<
z-F%JIW7%{+&sOa5g#Z8er?wUS&LVR?q6l(L(@hK452E|{wOZ{f^!nb_*t81H)8|2Y
z?}2+y)$h59AL#RRe%$Rv^f{`K_9q0+QBkt4>=NvYAa|Dt12{+L2o61XL7$(A(QFym
z9|PVe|6V4!6D<D0><4teexpj-fWPQ96B)Q44<-pNv6KdX?UR+VGIXE5z5IF`e7^st
z#wlAkFJG+W)e4}`%i(ICd-Qqfs%kDkpS#NQZTjeQ^5DuGm>X}^u@>eO!Z{gw)I$d5
zE5V0JX5RauA7|VD{h>qmV~_EZ3UohqdlQ)eeb{S5fnq!KVK;JR{0_PgpOUv}gFZY~
z<Yik7-><OdfZ#9WpMAa<`JdVCh5W64wIcsI3ht-Ce>i0g^7nCt`I+XQ)`9#>4<ASV
zw>;z-{>oYm{{xR0{www%|EMW=hoJfI=Ry8;v-*&K6a6{x*N-wp{!+h+k$-|*A@YBC
zG7$N<?igYCTX-V>-kb1_2K-|uW0C&_Z9nASYu17MH|UKZf76g=<ge_Kg8ZxJoJIaR
z3eCuWg?bC}=W}dC{<V$okpI#<`N&^#)D!t@4(~?(l3^{#->0n@`I|g~`4jk$jCdpe
ze_G#=|D$eyhX2WY<ge*gg8XIkt|5P&cxmMCmrnQJAN;&UF#q<K>{)U>jGh-C3!bw8
zK5rd@&#wo*9u|RXf$uDThftrx@cT$~u}LM!?_QHD5)5-jwuMKs3d|!97YyFlg?VJH
z%Epp!Fo(Pt6zqcc+<#ivMH^uLNw2l5JP7&J{tk%BM#22CeK{p62J_(CflK9l^gMX=
z!YyZd9z0`7R)W9nP{6Kx^c)yv9SZjc`Cn=4mrp@|6Zv)}1=cr@<r07MDVR5cdEMTf
zp!gz69kfmPVBUyzxxBRq=8SKuStI>0XIyu9kaCNjGxF7b&Z6gx4c43DfKR$Y()<Eg
zA8Jazci|m~r_pdI5BmT3&}VDURPgtE;iU`vb7;pGUQu6q&buFcrWbz4FbgmB2k`y9
zi{I+B!8~EdqnreN%}_tDa~XX93NxMloACZ3p(WGWu>j_X?aR)(2*4b1`qCQ<6PP1J
zG{qNp!u;^Nx@w>c`3uM$MgB|6Uo-q=1CjsHnFo;ne&cz_KLp+zX#QJV>Hg$ilcV_#
z`Nyn~K>oeT_Q>Db75Xdim%Uzy{0UJ%hQF^F@)r_&g8UuRpnm}W3-Qo@fd3`F6y(p&
z+K&7W6x1VsJI@K^zs|yf;V-Pp@c)>B{QZ6ABYy+Ub;v(<{RQNoyJ`*cPv7s0{B>>o
z8UC89k$>{@T;#tm0M-TY9~6ao0r+oIc+BvZkwX4F^-;+GKQ7MM7CtkL82+9skiUe<
zX5{~_1Mc&{f3W%w@^3!xg!~V+!MzIj->iW53E(fb=p6DdzYOQgaDL5)%4Yg~>WJD}
zxQ`22FI03d;nZ_G!|KB6KC)}|!xSg1d#jzu)J=zXAgEjH^^~aRet3T68{Frh?m><a
zyhjPY={#r;?^&a~Yo8m^?^#Z%U(GmGQW5S(c3Xg(7_WnfFV=lkk6`L-WwGw|HCduc
zD!I#X(=4oW6fcGEKNcltTm|o6#rGO#MAGkHx1L%!5`9KzGEcr^f$y!dEz0nY1a&ey
zk6~T2(ig0I!V}8rGm1<LZSuo9%Xy$TX85Ne>jC{<W`A4y4gFqbw?}ddr#)@%f4bi(
zcsJu6SQroQ3{Y3Edyc6)0sNrOr)HFBe;D{7Y=Ehg-lhicYkRyb4t3J+YuE4l-h=nG
z^;2of_=pxO4YBq5$Kah!O6r$s3R73Fh;?^1G-2I?Hg8S~`(}}PTryL~hWqwFL(EXA
zEWF3@rEDIC^P;}NKg~ds(~&!@#k-#c_uje)zAd#_H*yj74P7?~=O5H%K6D^Do)AU?
z!|hl%LqQ+bzm;*~z+d|PE^kRcyqlR%RXW=#5apuYSGJMI;2m#J^;2~w*2VWf#k!>~
z8d&FKlE^6+DOsPpkg2Qjf_|*2*lL+34DWruo~cJu;l1zd4AzTmqU<N%+O<-J^t<0d
zQ}!3EGmn3Ubt(Br={kX_%m_|dU$n;2ns%%+S?3M?|K+Rax5Vi8!J858u)fsI-5S;=
va0U-KJ6wJWdVE=%wk<Qjx+k5Vu<q1p7p!YE%q0d7+1&jYS&4O4S+@TJlU?Bk

literal 7080
zcmXY$c_3C@*T#>~s1zaOQ7B_186xK#8uUa7$vkC9hR{HXC{odrO3`R+kcbEsP9zl(
zZ>EZbLPh2f<^I-L_xEpSxvsT;_ugymwG-(7yoMkMekwA!!|1`{qlD1W?F7NQpEesX
zP<76{LKAa(7bKVQP@KP(xkHZS%uPOjJ~u~#s^vJ&jLRvf&AL(#Uh<kj6Ejq+MSt$6
zIPn|iDT?`%xxLc%LGx*<cG{QIyu6<_8*DfuFZPt;w0T}Gc2R=(hgL&nEo9b4ji6Z9
zI;xJaOCPEEhc=(xDp9B<O>rio^BVdiXhN!~ChW;wret9H(eAl}R2`pQX>xEP=I4|h
z&&{vA74sXk{7v4Rq5te#1o*P~Z5!m~7na4%@8xoCeu?=^Zl8dCly*4g*Od7O^HT^4
z;^yb3hWQO=wJ=#5=G|x-UWoaPiqGTb*S`|;Yy5XL<`@2N1z9pAUU+ejGdI8Pv$R=%
z*G_G|#}sE;^~MR|D2PuDt4z4ZR3s(@>LgA;e(p!}tc<~T_ct{GC5SV8)cK)^CU}&x
zEaHX8m+`;<Og-}lUzx`ZdG@r~P>(UO>l1vpviRrQ1DfEqmAjt*g)CQoV^CYs4spj@
zBuwT57tbFem((Hd%-ZUA(ZD5GTV<CyQ`YnGSiqWO@J-}>O2F^N=9d8LW%E0{81t+0
z@F6P_OT8~k_Hpwo?Zf<3W!`f0JI}|>FVu{A8UH5t@jLc+gZ%u0+5EP0;_q|wOMra?
z`CY5q!IY~Aohofz!p(2q8O%>~O9kdvBjk(uDaQqnWj#@ItH0K9^YfXe&FQKay0foR
zQ&(!2N9SyzINb*R@)FmXFI#42ew!Ii)h+9B$XT_Qwh#(y3lgfNrl&;}%MDDaNzsE&
z9Y61pU&DGSqxtar9-2I#PEXS2X7=m9XVg=Z9T($**L%UfiIPYj_h8zZ3$)MY5LBH!
z9lWA3k+zTuuTftmM@=_rNagPeq-T(dR)n|>`9;X!zn~6+sx|PKfAw56=C|Q3FE_tf
z-~;(7#Ke+cj51uv>&cj3?xGmXFZjVN%&&TbGUnG7_Lun@c2J?Vq#g6ikmBYy<XwaL
z9qIc2{A66owr0NLAtzwHke~ch8_aL5{V~k1BvXf*-=rM#MMx@sg0~d&yFzcH&DVXq
zURfAT6I*uWmTIa}ocl!=JU%OuABuPN-;4#l#d$~glO$-f`u|+xb~e(4V%(G8@|`p>
zS7yZDbbx8q7;+VS5B&PJ-Pm~eJ8f>XxH&b^lP0X!We4~;(1i4YAJ>WR<d3P1g?YPy
z-;!fTPVi;YW^eVE3;i^u2{r8^o0e4+XUK%tNp2I<S4GEpR{_6oChoE+S1`X;)fmk0
zIiDfsw|{>p^Pxb=ZpMGy{N_}ZVSWjR#WBB;8+S0jK<75HRVO2QB8{7$(QZS`FHf@r
z^E>d$1M`aysbGGL_l!)grZK;$*&NJII7bHav)Jl@`B^@SA^WN)ZQS3}n4jE1720h0
z*{0dY@6v>5sO>3#74WsJI(AG4_Oa`d);*%M*~EPHMb|TELSyWy%-g**A*t`1B2h-w
z@d$d<IvBut7`bx8K9b^$y?h|Nyo)CIf8D)vx*YyKEjBCQ4g9^)d8-6Xnvi=z)foJt
zI01!MbBqp9b-a8@qp!zl^EvMB1%m>x4zYh;4JA?2C3*qd`+BIlxl4=^FYJJG^hqdU
zvpvNb=M7JLUO;i0Kd9Jld<FIFebJXJLE7xk4ztV8lW5{fbjo>C;9|n@CF%>P+KGXn
zjNFs7xw_nja_4t6Po&RIR~0^*XKu`4Ptkm;Zr&L!_YyJMY`VPfdF^S6Q}q3KSd{`z
ztTK6TG;|B<Tdv8W8<&v}&yB~(=j7wHTt1<fkWY)kJme!L3H6)hqx1*)ByG5heCn3E
zBcFs}s87I0yae_Y%V)Hb%O_F``B>JrA)mS>8ptQ^DT91|W&A-th0*(vPuZ4g<g+*O
z1oBCmQG<M<4T_OZiisBT*_Km;e7*`qAfF3$u%Cd>+0xs{M^sW1`P}+#jC@Mk3z3gY
zog4BozYf1Y@Nt^~`v~|jJ7yuD&_QeDvomlA`2;G>fco02rRJ<jar$5SuN?BDIK%BP
zVy~_tOGl31+0nNQ>fEa-i{B4`qx<{yFJPSu13Y9#3TT2)_*vBGdgk^1jjleAVc!xS
zfm<XNp`Q9p*srYKpUbFs(Kwqa9WmM!uff&ZIpl|Wx^g*Oy<t1lJ1NjbzTQ87tG-Jz
z>KVmsq0RbUpYN9*1uhDvCoJ<RPG?1ScXuNBe5U=2kwyNHpUq~DU>I#SzS&|}_9IQm
zj5PnSpMbh4P@g)lf_a{j>tzM~vyR79bmd$)_n_Aon~r)Bav7-Cb+(duKJ&C^?bdeG
zn^w|Cz0vhIQSbZbVbn`oc#nLZ^2dBrtut3Ik=@@)hCXGL&?i1vEA<OgoXH^{p&M$<
zN1s(?Z7*{`&+qNpC5o_*?31p21K;010a6)e(6=KZM(-Pv&FN2u!&@Lf!biR1QYY%|
zBj7x<dQK;}dR<$|k3P~>k~hLpPjlv4uAWde>ishg`-0VLTf#J_C)?|ODdp;E&w~B)
zRP_D*KQzy=j4N+mx>1}6y)*-d405z}A|f~&@@p#-(OwSg_(Oga_fw{MvKog{PeNT(
znJ8)B_lg{L-8U7G=1kRout{Cw1p5W_hN4VSukz?3)LU0^kQx2t^7!*8S8tQ11?sKM
zPeZ*FZ`d!O*EQo1Gwk{zbK?pc^;S4R|2(_j|HX(1&3F1$dCkBmHLV!u(p}QZ3>6KK
zwm-H|b+fzHw%zWd&2L_`-Rmk(^OJKo?wA^)iQJs}_V4!0*q}jNhGGs?$Dg4gCA$Ij
zYA;WsUV}v_>J9yfCWngOwBELZdjRNlWIjc`eXBB1&xYp&>b30nNsbLEYtMhU4E3b9
z)X?UNPVSfgG8AW{LDwd)lN6`ccskqDlpJWglIXVuem7z5^tw5+w7G_h{TaRc6h}VF
zhBwfiCWPiW+J@OsgriM|l{NexqT^pvdLvQqlsD7~(9_ZFL%nptW6Z#3fsfQKuHKuh
zPSiUZ(9h~w4^6&7J%_Dr&j`nJo`Ms}sF!F9^=ByI+Gfsgit|=-)?+6}I7cm}-fL{g
z(y95Ix?>i?K1j_lHLak{=h_#FyF^f&(L8~YP%WCU5y{SY@RxZjt8uqh4Dy?CDdMVz
z66*ac1LqC&3PZ9{&u4KYQ#x5$aN(LWSMPNS>Mi+If_hBYD%49Ken7sJJzbe}8P>t-
zWo)I*emoVIw}Ewjdsp+jJ)9<pG~J!Pi^%+6XNPH7*w0hveEoJpowuGZd|%EK?srE_
zC9<-qsZB!CtJC<IN1uWYTbqNP{I(6dcAiB&1z|<h<0SB--i~e&CVyl}?8r_6_535d
zQ7@;m4E4nBcB7uwaeMMnXK?Ck|Nqzf!4LJZC?TcLlA6Bat(W5~MH8~U#z#BcnTPu|
z4=#=@rRrwA;1Spe_x(GPX<`k~XKpp>MHnxmc{hhY_3$H^(p;+MWH9K>xi6Ia1o|cD
zrCLCp2fYV@#;6yj6Hh+e|7wAoAAx%F9=UV%Yywd)GXFm6xp}9NrMY)X4h2CR=;hst
zggSZoPfmF&oNqB%<wIRGF<T~I*0GXoRGm}$vlh+=@#w_IS0+$5BoF>o`UG+M<)e4t
zJ~O%ZvU_Ma(`3eso^>6By5VLb<S~qT*>JB$Jr!BhlQH?oG^#FZ4xQ)2)l+dsz3E$W
zT)k#b)bkiOC7aCdCK|7U^9y>y8=<}o{@k|PycWK5B5bwboRo6nUK;2yuUC$F^<H-d
zy#@2SQ~V(=`Q`fKlAx!3SN|~Fzh^srdMrYbH6d41&F(CNxB~*8*L^{~J&Y6TSxf&!
zy@o}O<m(lhlXCIxsP|1;7xl={t*A#nT!(r^Ss6^tu|=nK_Lg$>ygg}i-kb;D)OJ9h
z@z?C?xI`0%D^?{h?_nC2rmrYfhWkGM-X~v|s?lcO_wXOsyc^;qy`G$cI@$Ewcm;8m
zY;16IaNn{F_Ep)P`jCgH*ZmUiDWKPFV}N>b#^q##RIx%pqXO!ENrHM0diw{@qu%4Y
zP{%<}sIZi2tbbV3d;;nu=rt^M1I`9Tc}45l`A}L<1?uTv%j~m^2Jm&<VH&=NoePPy
z)xEGz)#9ISG3;Du@X4-=od>J;jV=cc^^;TYhS+&9FeO|B;vQP9ypap*w~Nf@Sg~_p
z_NohTub7B#*{)*F&Vf&NORZt&zqY%UQ|O=T8ifA3S|;d!?%4mh{sinl*8jl&(7%7#
zGxYyhERX*FbD&;;|KG9-^iMl1%=Hg_g8l`bLFlh~#u@!hs?MUnTfZ0Af5}p~FLcXZ
zd<K1UTJ=Afi{BtFHPg-{6#U)0^`gc>-(||?;6iq;tB}$UgTAPAG1Wp4^kqVOTAZL>
zb?;H@VgKFA%CQ&v0<Wz=MR+iLKb~A43i<<k3ueWfgYRdQTwZTt=eRuqeJtO^Jp*q~
zqkmjD)NSyW)mp&yKU&K54?d3mR?~3r1^=aQXs&;rDcApj7y7FX!2N*rcmIt3FM<=f
z{=pZ~e|9nO1OMHtg3;eke;EA-dA`AX_IBHiH?iz|CU|XSGR$WitfYO6VJ`b7DXSR=
zbJ@`g6X)*1TqYu}Y&{$1GXK+_33V`!jjLU7{R;D<)0o235OyBZ3n_r}Jm;jJ#q>?6
zmj+buoidoiLe#7qjbRR3GqaN#gE>qo_?Irs^#(4*()khSpV=jj{>hQ?=r3~I7yae+
zwbB3YsbcikxzvgN`<xTdKUP!>{WWvz&_8qw^cC>8?!1QnA<n<i-y!u9`VU7{q5t6Y
za`azjaTn&RuKN;apl)rkWE8y4vh$UL!Qx1m`*P3!&(0R+s`CeXHDG@#UJ|@m1@qNh
z*&`8455v4?Iyd%$I?Pjo%0@x1Fi+{6f2rLG_iRDlV@;MYM};sizP^Jw%CKNGAqM6s
zokeGPV9psb*33ON0`rsUa{d%0^jA+`iT)}vP`|+6Y0eb-H~)hE0RBH$$D;q~8Qtg~
zBi4%k=Xy)f-%iL3{VQt&(0~8=qv&77`x^ZR8`ID~yw(Q&@7D97KVQ-X*gx@prxuk%
zzm=$x{l}T@x1N`h%whke+Le2GL!Wi1+p`kx)1!ITk1|czKC7{y&xq}>WnWYlKs}Eu
zde|fn=T1!ZdWtAp&vV5tZ-T$i`>C5h3j0s$W>~E@eAmwm87XAz_|><cH?#YDcm2^W
z=%-JPSoj67{dDx46x4enP3--u2)2*PW|rTBKDvO`?YaZ!;)uJ!G6(3Ri)Y%J!2Xxt
zb|cmI6r7I%Z_Dd&ub3kEre)ywE9I2GDT)FA1y|QKDZuxnIa@QXgTHo^Zgw?Wr*GSI
zOvCs6DLRh3p<fnjXuCVI{c`wt=5e-PIu%s<vwc$0&ha_hCw*IW5}{8v@dRZRK%Wdd
zf8h8e+aGT_d09d|&b8~z2x057C;eQPt;ZgYyp2$Yz1BYxTLE?0oeC6Z=M2fHMF(Nd
zYMLlrk$eW$SK5$ZH-Y@4)q9Ztisw)tf&ay*Bgp^7!rRE-Bl!^W&*i;|`~@BCkbmcV
zUgV!`0&@q;f2$jpzi={_e>}{e!2g5@>|5X;y1Wkg*H#2@`ByJN{ySHGNB$Shw;}(J
z0Yb?C{>dlE-*7k74d74ziA4Su3cSew_P7u7cdUoG4fs!#!+Rv~Zw=2z{(1@>$bWkd
zyu$&1{X4)H_`7?%A%EQkuaLja_Up+1q#D#I;GY|K4*A>H%tHPxtDx@y{{yPB$p6Ss
zxJLtj`wCv<|G5m#FYq_qlZgDcM#1`7{%3X~|AWJ!$bZ47*)0EOV_#j6|D~=GsJ}n>
zdE|ZAdC?=?0{Y)@d!EYkdBEqDgWqeIbB6b9KfU8B{C$8!@e8>B@Ckph{w@Xido+}=
z_oF|#+hfmNg?Z$@3aM5F^N5a8<W{JUZN&pgPCD>iD{*+th@C?`TD>J;4*A`mA({*G
zhou^$90K#;tASJQ(ASz362?BleXO%0=GTRB@V6d1XaRi2H%IUM{)?RhGgXRVAI%Zy
z(QJVC5gyYO`<}SMyb;W6x66y>JN?>5kWU2Wjmu8^?AF4Z(WCsBs)RY?rafaO#?Bdq
zSFC`4ZjOF}DDdOYlJ$KB?-v6q@(x`upg(-r_$X2h_Qz1GR+Az4`xiT>#=`!z{VJp&
z&(3-GzRE&<pAoS=P#oST2=9t&6L?=xw3%_$*c#>u!^R$M=##Zu^|n22fw@HMA97zh
z%n_DK8>)L@jyP2yqXPL038?!{?SlDXv|+to9`ffqx(4~*J2S@RZ#W<M2gsdA{*i7i
zT>fgKT>jE*eeq~0{+NUOpLn(+f1R5N$p1p767qi(RLt^ET4r6r<$q!`^1mGh_aETj
za%2<oKe>7-@}JPiM*b5E+>rl)vIOLB^#$(Hz+buJFPFbYEb{l$fb{`?&+}8rU(!|!
z`TMPf`vveXg7+?#|MR2<<bQ8=6qmmM)Gy%gH+&QMI~7+U|J>SG<i9qSz5f9JkRas0
z@CU35`1jmVLjFNhv$_03N#uXx`3&U$w)-mbH~o7b`Rhq6M*eDZ;GIGKR6Zva?!Um_
zdl=qdfWP)*nBRtr8pCv;eu;)2-Q=VO{bkou9!Fp1J<mUifBV?`?(VgnOW{5aalyAs
zxN*|^**M8*CuQ<I&piDTzB@3kW91~gN6mY&S?o2uXN~i&u|2@PXE|+p-@v>RH_`0f
zLjyNK-qpcb829f-Rg6>rt%z|(7dDgc#9hjxmMz3M^;3bc{)uRp38+6~73mN9p^x$Z
zy45&y3E8))tm#(oYVf^Pz5B8Z#yS4og>es6p$|h`eRdGjw~0ykHJF2OE}>=MyYbhV
z2d?aUncdeY3HH5g*M{jbraj}LvuX&emzQ@y&_Dy@PH%&`l8yTXeFftFRZ}P1?~QqR
zejqSzfe74h=XiSAY5-Shu|zT9C+zzg@yGo=`N`BHY1;^_cdoej7>_bHF1`cfggfD#
z8{(#BnKGZYw`?8APR6)jQ)93n-p7T%8i)5dzT_uvaBpvZ@1McDmgy)E(M)fGxo5iR
z)Ehej<4)e@<;EFN7?<S}LUt55Z@2bn$GBRf&9MK?4IBO7++T@){jw15`6kn!*GpKF
zZx^>;Q{EpA?|6esdfJaLu72MkjC-{n`Xt2Z);ck7g*^-_*Kp(XFT}F%eI0UX1ibh8
zy0>5W%)a+c{&nS$)g$Lpm%uwSy!##c-I0fJ%lBGg+~-VnjGMCkz*LV2tFQ8e^+Mc4
zJ-mlaKPxjBeagNMrpbvUu<wJOs?y7t!Nc}pP4NFPb$kzv*Bp3)ag-v=MGz-r2J;BS
WsjXc^4j$flXpbcLLfq>9ss96i%Kv2m


From 6fa2c4dafed9e033f9d25ea23122f61b16180a35 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 20 Jun 2024 08:32:14 -0600
Subject: [PATCH 080/571] refactor(fluids): Rename norm -> normal in freestream

Also remove comment references to potential energy, which is no longer
included in the total energy definition.
---
 examples/fluids/qfunctions/bc_freestream.h   | 56 ++++++++++----------
 examples/fluids/qfunctions/newtonian_state.h |  4 +-
 examples/fluids/qfunctions/riemann_solver.h  |  1 -
 3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/examples/fluids/qfunctions/bc_freestream.h b/examples/fluids/qfunctions/bc_freestream.h
index 4475f65495..90700496e0 100644
--- a/examples/fluids/qfunctions/bc_freestream.h
+++ b/examples/fluids/qfunctions/bc_freestream.h
@@ -30,17 +30,17 @@ CEED_QFUNCTION_HELPER int Freestream(void *ctx, CeedInt Q, const CeedScalar *con
     const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
     const State      s     = StateFromQ(newt_ctx, qi, state_var);
 
-    CeedScalar wdetJb, norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, norm);
+    CeedScalar wdetJb, normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     StateConservative flux;
     switch (flux_type) {
       case RIEMANN_HLL:
-        flux = RiemannFlux_HLL(newt_ctx, s, context->S_infty, norm);
+        flux = RiemannFlux_HLL(newt_ctx, s, context->S_infty, normal);
         break;
       case RIEMANN_HLLC:
-        flux = RiemannFlux_HLLC(newt_ctx, s, context->S_infty, norm);
+        flux = RiemannFlux_HLLC(newt_ctx, s, context->S_infty, normal);
         break;
     }
     CeedScalar Flux[5];
@@ -94,8 +94,8 @@ CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedSc
   const State                    dS_infty    = {0};
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, norm);
+    CeedScalar wdetJb, normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     CeedScalar qi[5], dqi[5];
@@ -107,10 +107,10 @@ CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedSc
     StateConservative dflux;
     switch (flux_type) {
       case RIEMANN_HLL:
-        dflux = RiemannFlux_HLL_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, norm);
+        dflux = RiemannFlux_HLL_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, normal);
         break;
       case RIEMANN_HLLC:
-        dflux = RiemannFlux_HLLC_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, norm);
+        dflux = RiemannFlux_HLLC_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, normal);
         break;
     }
     CeedScalar dFlux[5];
@@ -182,8 +182,8 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar
   const bool                     is_implicit = gas->is_implicit;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
     const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
     const State      s_int = StateFromQ(gas, qi, state_var);
@@ -191,10 +191,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar
     StatePrimitive y_ext      = s_int.Y;
     y_ext.pressure            = outflow->pressure;
     y_ext.temperature         = outflow->temperature;
-    const CeedScalar u_normal = Dot3(y_ext.velocity, norm);
+    const CeedScalar u_normal = Dot3(y_ext.velocity, normal);
     const CeedScalar proj     = (1 - outflow->recirc) * Softplus(-u_normal, outflow->softplus_velocity);
     for (CeedInt j = 0; j < 3; j++) {
-      y_ext.velocity[j] += norm[j] * proj;  // (I - n n^T) projects into the plane tangent to the normal
+      y_ext.velocity[j] += normal[j] * proj;  // (I - n n^T) projects into the plane tangent to the normal
     }
     State s_ext = StateFromPrimitive(gas, y_ext);
 
@@ -207,10 +207,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar
     KMUnpack(kmstress, stress);
     ViscousEnergyFlux(gas, s_int.Y, grad_s, stress, Fe);
 
-    StateConservative F_inviscid_normal = RiemannFlux_HLLC(gas, s_int, s_ext, norm);
+    StateConservative F_inviscid_normal = RiemannFlux_HLLC(gas, s_int, s_ext, normal);
 
     CeedScalar Flux[5];
-    FluxTotal_RiemannBoundary(F_inviscid_normal, stress, Fe, norm, Flux);
+    FluxTotal_RiemannBoundary(F_inviscid_normal, stress, Fe, normal, Flux);
 
     for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j];
 
@@ -251,8 +251,8 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce
   const bool                     is_implicit = gas->is_implicit;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     CeedScalar qi[5], kmstress[6], dqi[5];
@@ -267,13 +267,13 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce
     y_ext.temperature          = outflow->temperature;
     dy_ext.pressure            = 0;
     dy_ext.temperature         = 0;
-    const CeedScalar u_normal  = Dot3(s_int.Y.velocity, norm);
-    const CeedScalar du_normal = Dot3(ds_int.Y.velocity, norm);
+    const CeedScalar u_normal  = Dot3(s_int.Y.velocity, normal);
+    const CeedScalar du_normal = Dot3(ds_int.Y.velocity, normal);
     const CeedScalar proj      = (1 - outflow->recirc) * Softplus(-u_normal, outflow->softplus_velocity);
     const CeedScalar dproj     = (1 - outflow->recirc) * Softplus_fwd(-u_normal, -du_normal, outflow->softplus_velocity);
     for (CeedInt j = 0; j < 3; j++) {
-      y_ext.velocity[j] += norm[j] * proj;
-      dy_ext.velocity[j] += norm[j] * dproj;
+      y_ext.velocity[j] += normal[j] * proj;
+      dy_ext.velocity[j] += normal[j] * dproj;
     }
 
     State s_ext  = StateFromPrimitive(gas, y_ext);
@@ -289,10 +289,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce
     KMUnpack(kmstress, stress);
     ViscousEnergyFlux_fwd(gas, s_int.Y, ds_int.Y, grad_ds, stress, dstress, dFe);
 
-    StateConservative dF_inviscid_normal = RiemannFlux_HLLC_fwd(gas, s_int, ds_int, s_ext, ds_ext, norm);
+    StateConservative dF_inviscid_normal = RiemannFlux_HLLC_fwd(gas, s_int, ds_int, s_ext, ds_ext, normal);
 
     CeedScalar dFlux[5];
-    FluxTotal_RiemannBoundary(dF_inviscid_normal, dstress, dFe, norm, dFlux);
+    FluxTotal_RiemannBoundary(dF_inviscid_normal, dstress, dFe, normal, dFlux);
 
     for (int j = 0; j < 5; j++) v[j][i] = -wdetJb * dFlux[j];
   }
@@ -334,8 +334,8 @@ CEED_QFUNCTION_HELPER int PressureOutflow(void *ctx, CeedInt Q, const CeedScalar
     State            s     = StateFromQ(gas, qi, state_var);
     s.Y.pressure           = outflow->pressure;
 
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     State grad_s[3];
@@ -351,7 +351,7 @@ CEED_QFUNCTION_HELPER int PressureOutflow(void *ctx, CeedInt Q, const CeedScalar
     FluxInviscid(gas, s, F_inviscid);
 
     CeedScalar Flux[5];
-    FluxTotal_Boundary(F_inviscid, stress, Fe, norm, Flux);
+    FluxTotal_Boundary(F_inviscid, stress, Fe, normal, Flux);
 
     for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j];
 
@@ -392,8 +392,8 @@ CEED_QFUNCTION_HELPER int PressureOutflow_Jacobian(void *ctx, CeedInt Q, const C
   const bool                     is_implicit = gas->is_implicit;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     CeedScalar qi[5], kmstress[6], dqi[5];
@@ -420,7 +420,7 @@ CEED_QFUNCTION_HELPER int PressureOutflow_Jacobian(void *ctx, CeedInt Q, const C
     FluxInviscid_fwd(gas, s, ds, dF_inviscid);
 
     CeedScalar dFlux[5];
-    FluxTotal_Boundary(dF_inviscid, dstress, dFe, norm, dFlux);
+    FluxTotal_Boundary(dF_inviscid, dstress, dFe, normal, dFlux);
 
     for (int j = 0; j < 5; j++) v[j][i] = -wdetJb * dFlux[j];
   }
diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h
index e41dd8d4fd..fa38c45e68 100644
--- a/examples/fluids/qfunctions/newtonian_state.h
+++ b/examples/fluids/qfunctions/newtonian_state.h
@@ -55,14 +55,12 @@ CEED_QFUNCTION_HELPER CeedScalar SoundSpeed(NewtonianIdealGasContext gas, CeedSc
 CEED_QFUNCTION_HELPER CeedScalar Mach(NewtonianIdealGasContext gas, CeedScalar T, CeedScalar u) { return u / SoundSpeed(gas, T); }
 
 CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy(NewtonianIdealGasContext gas, const State s) {
-  // Ignoring potential energy
-  CeedScalar e_internal = gas->cv * s.Y.temperature;
   CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
+  CeedScalar e_internal = gas->cv * s.Y.temperature;
   return e_internal + e_kinetic + s.Y.pressure / s.U.density;
 }
 
 CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy_fwd(NewtonianIdealGasContext gas, const State s, const State ds) {
-  // Ignoring potential energy
   CeedScalar de_kinetic  = Dot3(ds.Y.velocity, s.Y.velocity);
   CeedScalar de_internal = gas->cv * ds.Y.temperature;
   return de_internal + de_kinetic + ds.Y.pressure / s.U.density - s.Y.pressure / Square(s.U.density) * ds.U.density;
diff --git a/examples/fluids/qfunctions/riemann_solver.h b/examples/fluids/qfunctions/riemann_solver.h
index 1a793b356c..b3d36f86ba 100644
--- a/examples/fluids/qfunctions/riemann_solver.h
+++ b/examples/fluids/qfunctions/riemann_solver.h
@@ -108,7 +108,6 @@ CEED_QFUNCTION_HELPER void ComputeHLLSpeeds_Roe(NewtonianIdealGasContext gas, St
   // Stability requires that these speed estimates are *at least* as fast as the physical wave speeds.
   CeedScalar u_roe = RoeAverage(r, u_left, u_right);
 
-  // TODO: revisit this for gravity
   CeedScalar H_left  = TotalSpecificEnthalpy(gas, left);
   CeedScalar H_right = TotalSpecificEnthalpy(gas, right);
   CeedScalar H_roe   = RoeAverage(r, H_left, H_right);

From 935f026aeac4afd1a5641109d8d2197022f564c1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 21 Jun 2024 10:29:06 -0600
Subject: [PATCH 081/571] op - add terse view

---
 include/ceed/ceed.h              |  1 +
 interface/ceed-operator.c        | 42 +++++++++++++++++++++++++++-----
 tests/output/t504-operator-f.out |  4 +--
 tests/output/t504-operator.out   |  6 +++--
 tests/output/t523-operator.out   |  6 +++++
 tests/t504-operator-f.f90        |  2 ++
 tests/t504-operator.c            |  4 +++
 tests/t523-operator.c            |  2 ++
 8 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index f0f6f65be7..4c950a02bd 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -438,6 +438,7 @@ CEED_EXTERN int  CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVe
 CEED_EXTERN int  CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorSetName(CeedOperator op, const char *name);
 CEED_EXTERN int  CeedOperatorView(CeedOperator op, FILE *stream);
+CEED_EXTERN int  CeedOperatorViewTerse(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorGetCeed(CeedOperator op, Ceed *ceed);
 CEED_EXTERN Ceed CeedOperatorReturnCeed(CeedOperator op);
 CEED_EXTERN int  CeedOperatorGetNumElements(CeedOperator op, CeedInt *num_elem);
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 9e96a1ac84..9a1277084a 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1469,16 +1469,16 @@ int CeedOperatorSetName(CeedOperator op, const char *name) {
 }
 
 /**
-  @brief View a `CeedOperator`
+  @brief Core logic for viewing a `CeedOperator`
 
-  @param[in] op     `CeedOperator` to view
+  @param[in] op     `CeedOperator` to view brief summary
   @param[in] stream Stream to write; typically `stdout` or a file
 
   @return Error code: 0 - success, otherwise - failure
 
   @ref User
 **/
-int CeedOperatorView(CeedOperator op, FILE *stream) {
+int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
   bool has_name = op->name, is_composite;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -1492,16 +1492,46 @@ int CeedOperatorView(CeedOperator op, FILE *stream) {
 
     for (CeedInt i = 0; i < num_suboperators; i++) {
       has_name = sub_operators[i]->name;
-      fprintf(stream, "  SubOperator %" CeedInt_FMT "%s%s:\n", i, has_name ? " - " : "", has_name ? sub_operators[i]->name : "");
-      CeedCall(CeedOperatorSingleView(sub_operators[i], 1, stream));
+      fprintf(stream, "  SubOperator %" CeedInt_FMT "%s%s%s\n", i, has_name ? " - " : "", has_name ? sub_operators[i]->name : "", is_full ? ":" : "");
+      if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], 1, stream));
     }
   } else {
     fprintf(stream, "CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : "");
-    CeedCall(CeedOperatorSingleView(op, 0, stream));
+    if (is_full) CeedCall(CeedOperatorSingleView(op, 0, stream));
   }
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedOperator`
+
+  @param[in] op     `CeedOperator` to view
+  @param[in] stream Stream to write; typically `stdout` or a file
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorView(CeedOperator op, FILE *stream) {
+  CeedCall(CeedOperatorView_Core(op, stream, true));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief View a brief summary `CeedOperator`
+
+  @param[in] op     `CeedOperator` to view brief summary
+  @param[in] stream Stream to write; typically `stdout` or a file
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorViewTerse(CeedOperator op, FILE *stream) {
+  CeedCall(CeedOperatorView_Core(op, stream, false));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the `Ceed` associated with a `CeedOperator`
 
diff --git a/tests/output/t504-operator-f.out b/tests/output/t504-operator-f.out
index 3fcc6b0458..41b0ea772b 100644
--- a/tests/output/t504-operator-f.out
+++ b/tests/output/t504-operator-f.out
@@ -1,4 +1,4 @@
-CeedOperator
+CeedOperator - setup
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
@@ -19,7 +19,7 @@ CeedOperator
       EvalMode: none
       No basis
       Active vector
-CeedOperator
+CeedOperator - mass
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
diff --git a/tests/output/t504-operator.out b/tests/output/t504-operator.out
index 3fcc6b0458..3f1a31a7d4 100644
--- a/tests/output/t504-operator.out
+++ b/tests/output/t504-operator.out
@@ -1,4 +1,5 @@
-CeedOperator
+CeedOperator - setup
+CeedOperator - setup
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
@@ -19,7 +20,8 @@ CeedOperator
       EvalMode: none
       No basis
       Active vector
-CeedOperator
+CeedOperator - mass
+CeedOperator - mass
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
diff --git a/tests/output/t523-operator.out b/tests/output/t523-operator.out
index 1817a8a2cf..87d61f5b0a 100644
--- a/tests/output/t523-operator.out
+++ b/tests/output/t523-operator.out
@@ -1,3 +1,6 @@
+Composite CeedOperator - setup
+  SubOperator 0 - triangle elements
+  SubOperator 1 - quadrilateral elements
 Composite CeedOperator - setup
   SubOperator 0 - triangle elements:
     6 elements with 4 quadrature points each
@@ -39,6 +42,9 @@ Composite CeedOperator - setup
         Size: 1
         EvalMode: none
         No basis
+Composite CeedOperator - mass
+  SubOperator 0 - triangle elements
+  SubOperator 1 - quadrilateral elements
 Composite CeedOperator - mass
   SubOperator 0 - triangle elements:
     6 elements with 4 quadrature points each
diff --git a/tests/t504-operator-f.f90 b/tests/t504-operator-f.f90
index ccc87614bc..5e555d5c13 100644
--- a/tests/t504-operator-f.f90
+++ b/tests/t504-operator-f.f90
@@ -92,7 +92,9 @@ program test
       call ceedoperatorsetfield(op_mass,'v',erestrictu,bu,&
      & ceed_vector_active,err)
 
+      call ceedoperatorsetname(op_setup,'setup',err)
       call ceedoperatorview(op_setup,err)
+      call ceedoperatorsetname(op_mass,'mass',err)
       call ceedoperatorview(op_mass,err)
 
       call ceedvectordestroy(qdata,err)
diff --git a/tests/t504-operator.c b/tests/t504-operator.c
index ce5e7bb0c2..4d75c79b0c 100644
--- a/tests/t504-operator.c
+++ b/tests/t504-operator.c
@@ -66,7 +66,11 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
+  CeedOperatorSetName(op_setup, "setup");
+  CeedOperatorViewTerse(op_setup, stdout);
   CeedOperatorView(op_setup, stdout);
+  CeedOperatorSetName(op_mass, "mass");
+  CeedOperatorViewTerse(op_mass, stdout);
   CeedOperatorView(op_mass, stdout);
 
   CeedVectorDestroy(&q_data);
diff --git a/tests/t523-operator.c b/tests/t523-operator.c
index b882379516..9b614ec360 100644
--- a/tests/t523-operator.c
+++ b/tests/t523-operator.c
@@ -164,7 +164,9 @@ int main(int argc, char **argv) {
   CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
 
   // View
+  CeedOperatorViewTerse(op_setup, stdout);
   CeedOperatorView(op_setup, stdout);
+  CeedOperatorViewTerse(op_mass, stdout);
   CeedOperatorView(op_mass, stdout);
 
   // Cleanup

From 097cc79570fc1ccd17774d2bf9cd5e51d3925370 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 21 Jun 2024 14:37:57 -0600
Subject: [PATCH 082/571] basis: CreateProjection set q_ref, q_weight to NULL

---
 backends/cuda-ref/ceed-cuda-ref-basis.c       |  30 ++--
 backends/cuda-shared/ceed-cuda-shared-basis.c |   9 +-
 backends/hip-ref/ceed-hip-ref-basis.c         |  30 ++--
 backends/hip-shared/ceed-hip-shared-basis.c   |   9 +-
 backends/magma/ceed-magma-basis.c             |  30 ++--
 backends/sycl-gen/ceed-sycl-gen.sycl.cpp      |   2 +-
 .../sycl-ref/ceed-sycl-ref-basis.sycl.cpp     |  30 +++-
 .../ceed-sycl-shared-basis.sycl.cpp           |  15 +-
 .../sycl-shared/ceed-sycl-shared.sycl.cpp     |   2 +-
 interface/ceed-basis.c                        |  12 +-
 tests/t319-basis.c                            | 165 +++++++++++-------
 11 files changed, 212 insertions(+), 122 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 529c538182..e4ec48105d 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -60,6 +60,7 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
       const int block_size_x  = Q_1d;
       const int block_size_y  = dim >= 2 ? Q_1d : 1;
@@ -157,6 +158,7 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr
       }
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
 
       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
@@ -182,7 +184,7 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
-  CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallBackend(CeedFree(&data));
@@ -199,7 +201,7 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
-  CeedCallCuda(ceed, cudaFree(data->d_q_weight));
+  if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight));
   CeedCallCuda(ceed, cudaFree(data->d_interp));
   CeedCallCuda(ceed, cudaFree(data->d_grad));
   CeedCallCuda(ceed, cudaFree(data->d_div));
@@ -225,8 +227,10 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy data to GPU
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  }
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
@@ -273,8 +277,10 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -330,8 +336,10 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -387,8 +395,10 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index c22bce82da..f5b5897167 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -148,6 +148,7 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
       CeedInt Q_1d;
       CeedInt block_size = 32;
 
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
       if (dim == 1) {
@@ -195,7 +196,7 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
-  CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
@@ -220,8 +221,10 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy basis data to GPU
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  }
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 3163018669..243e801e52 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -59,6 +59,7 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
       const int block_size_x  = Q_1d;
       const int block_size_y  = dim >= 2 ? Q_1d : 1;
@@ -156,6 +157,7 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra
       }
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
 
       CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
@@ -181,7 +183,7 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
-  CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallBackend(CeedFree(&data));
@@ -198,7 +200,7 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
-  CeedCallHip(ceed, hipFree(data->d_q_weight));
+  if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight));
   CeedCallHip(ceed, hipFree(data->d_interp));
   CeedCallHip(ceed, hipFree(data->d_grad));
   CeedCallHip(ceed, hipFree(data->d_div));
@@ -224,8 +226,10 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy data to GPU
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  }
   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
@@ -272,8 +276,10 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -329,8 +335,10 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -386,8 +394,10 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 7642043d4c..298a270292 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -205,6 +205,7 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
       CeedInt Q_1d;
       CeedInt block_size = data->block_sizes[2];
 
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
@@ -254,7 +255,7 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
-  CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
@@ -279,8 +280,10 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy basis data to GPU
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  }
   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 3d00a64100..f1e52ba2db 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -199,6 +199,7 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
     } break;
     case CEED_EVAL_WEIGHT: {
       CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[e_mode]);
       CeedInt elem_dofs_size = CeedIntPow(Q, dim);
       CeedInt num_threads    = 1;
       CeedInt num_t_col      = 1;
@@ -414,6 +415,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     }
   } else {
     CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+    CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight not set", CeedEvalModes[e_mode]);
     CeedInt num_t_col  = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
     CeedInt grid       = CeedDivUpInt(num_elem, num_t_col);
     CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
@@ -449,7 +451,7 @@ static int CeedBasisDestroy_Magma(CeedBasis basis) {
 #endif
   CeedCallBackend(magma_free(impl->d_interp_1d));
   CeedCallBackend(magma_free(impl->d_grad_1d));
-  CeedCallBackend(magma_free(impl->d_q_weight_1d));
+  if (impl->d_q_weight_1d) CeedCallBackend(magma_free(impl->d_q_weight_1d));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -476,7 +478,7 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
   CeedCallBackend(magma_free(impl->d_grad));
   CeedCallBackend(magma_free(impl->d_div));
   CeedCallBackend(magma_free(impl->d_curl));
-  CeedCallBackend(magma_free(impl->d_q_weight));
+  if (impl->d_q_weight) CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -500,8 +502,10 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
-  magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
+  if (q_weight_1d) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
+    magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
+  }
   CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
   magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
   CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
@@ -594,8 +598,10 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (q_weight) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+    magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  }
   if (interp) {
     CeedInt q_comp_interp;
 
@@ -653,8 +659,10 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (q_weight) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+    magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  }
   if (interp) {
     CeedInt q_comp_interp;
 
@@ -712,8 +720,10 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (q_weight) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+    magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  }
   if (interp) {
     CeedInt q_comp_interp;
 
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index 3b274c8348..2cee0469a7 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -19,7 +19,7 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   Ceed       ceed_shared;
-  Ceed_Sycl *data, *shared_data;
+  Ceed_Sycl *data;
   char      *resource_root;
   const char fallback_resource[] = "/gpu/sycl/ref";
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
index 54c01f0825..e5495fe83f 100644
--- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
@@ -312,6 +312,7 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
       }
       break;
     case CEED_EVAL_WEIGHT:
+      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
       break;
     case CEED_EVAL_NONE: /* handled separately below */
@@ -487,6 +488,7 @@ static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem,
       CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
       break;
     case CEED_EVAL_WEIGHT:
+      CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
       break;
     case CEED_EVAL_NONE: /* handled separately below */
@@ -520,7 +522,7 @@ static int CeedBasisDestroy_Sycl(CeedBasis basis) {
   // Wait for all work to finish before freeing memory
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
 
-  CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
+  if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
 
@@ -542,7 +544,7 @@ static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
   // Wait for all work to finish before freeing memory
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
 
-  CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
+  if (impl->d_q_weight) CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));
 
@@ -581,17 +583,23 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
-  CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+  std::vector<sycl::event> copy_events;
+  if (q_weight_1d) {
+    CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
+    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+    copy_events.push_back(copy_weight);
+  }
 
   const CeedInt interp_length = Q_1d * P_1d;
   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
+  copy_events.push_back(copy_interp);
 
   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
+  copy_events.push_back(copy_grad);
 
-  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
+  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
 
   std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
                                              sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};
@@ -636,18 +644,24 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
-  CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);
+  std::vector<sycl::event> copy_events;
+  if (q_weight) {
+    CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
+    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);
+    copy_events.push_back(copy_weight);
+  }
 
   const CeedInt interp_length = num_qpts * num_nodes;
   CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e);
+  copy_events.push_back(copy_interp);
 
   const CeedInt grad_length = num_qpts * num_nodes * dim;
   CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e);
+  copy_events.push_back(copy_grad);
 
-  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
+  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
index 27ca11b6e5..d549f6cd4f 100644
--- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
@@ -106,6 +106,7 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce
       //-----------
       std::vector<sycl::event> e;
 
+      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]);
       if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};
 
       ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
@@ -143,7 +144,7 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetData(basis, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
-  CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
+  if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_collo_grad_1d, data->sycl_context));
@@ -198,17 +199,23 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
   // Copy basis data to GPU
-  CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+  std::vector<sycl::event> copy_events;
+  if (q_weight_1d) {
+    CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
+    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+    copy_events.push_back(copy_weight);
+  }
 
   const CeedInt interp_length = Q_1d * P_1d;
   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
+  copy_events.push_back(copy_interp);
 
   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
+  copy_events.push_back(copy_grad);
 
-  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
+  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
 
   // Compute collocated gradient and copy to GPU
   impl->d_collo_grad_1d          = NULL;
diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
index d629e76f95..d7018c149c 100644
--- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
@@ -19,7 +19,7 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) {
   Ceed       ceed_ref;
-  Ceed_Sycl *data, *ref_data;
+  Ceed_Sycl *data;
   char      *resource_root;
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index fe5ae3fb73..f134cd30bd 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1334,7 +1334,7 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi
   Ceed        ceed;
   bool        is_tensor;
   CeedInt     dim, num_comp;
-  CeedScalar *q_ref, *q_weight, *interp_project, *grad_project;
+  CeedScalar *interp_project, *grad_project;
 
   CeedCall(CeedBasisGetCeed(basis_to, &ceed));
 
@@ -1350,9 +1350,7 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi
 
     CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_1d_from));
     CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_1d_to));
-    CeedCall(CeedCalloc(P_1d_to, &q_ref));
-    CeedCall(CeedCalloc(P_1d_to, &q_weight));
-    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, interp_project, grad_project, q_ref, q_weight, basis_project));
+    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, interp_project, grad_project, NULL, NULL, basis_project));
   } else {
     // Even if basis_to and basis_from are not H1, the resulting basis is H1 for interpolation to work
     CeedInt          num_nodes_to, num_nodes_from;
@@ -1361,16 +1359,12 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi
     CeedCall(CeedBasisGetTopology(basis_to, &topo));
     CeedCall(CeedBasisGetNumNodes(basis_from, &num_nodes_from));
     CeedCall(CeedBasisGetNumNodes(basis_to, &num_nodes_to));
-    CeedCall(CeedCalloc(num_nodes_to * dim, &q_ref));
-    CeedCall(CeedCalloc(num_nodes_to, &q_weight));
-    CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, q_ref, q_weight, basis_project));
+    CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, NULL, NULL, basis_project));
   }
 
   // Cleanup
   CeedCall(CeedFree(&interp_project));
   CeedCall(CeedFree(&grad_project));
-  CeedCall(CeedFree(&q_ref));
-  CeedCall(CeedFree(&q_weight));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index 8e542ad1c6..3c2d99fa02 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -34,6 +34,79 @@ static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
   return tol;
 }
 
+static void VerifyProjectedBasis(CeedBasis basis_project, CeedInt dim, CeedInt p_to_dim, CeedInt p_from_dim, CeedVector x_to, CeedVector x_from,
+                                 CeedVector u_to, CeedVector u_from, CeedVector du_to) {
+  CeedScalar tol;
+
+  {
+    CeedScalarType scalar_type;
+
+    CeedGetScalarType(&scalar_type);
+    tol = GetTolerance(scalar_type, dim);
+  }
+
+  // Setup coarse solution
+  {
+    const CeedScalar *x_array;
+    CeedScalar        u_array[p_from_dim];
+
+    CeedVectorGetArrayRead(x_from, CEED_MEM_HOST, &x_array);
+    for (CeedInt i = 0; i < p_from_dim; i++) {
+      CeedScalar coord[dim];
+      for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_from_dim * d + i];
+      u_array[i] = Eval(dim, coord);
+    }
+    CeedVectorRestoreArrayRead(x_from, &x_array);
+    CeedVectorSetArray(u_from, CEED_MEM_HOST, CEED_COPY_VALUES, u_array);
+  }
+
+  // Project to fine basis
+  CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u_from, u_to);
+
+  // Check solution
+  {
+    const CeedScalar *x_array, *u_array;
+
+    CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
+    CeedVectorGetArrayRead(u_to, CEED_MEM_HOST, &u_array);
+    for (CeedInt i = 0; i < p_to_dim; i++) {
+      CeedScalar coord[dim];
+      for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_to_dim + i];
+      const CeedScalar u = Eval(dim, coord);
+      if (fabs(u - u_array[i]) > tol) printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_array[i], u);
+    }
+    CeedVectorRestoreArrayRead(x_to, &x_array);
+    CeedVectorRestoreArrayRead(u_to, &u_array);
+  }
+
+  // Project and take gradient
+  CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u_from, du_to);
+
+  // Check solution
+  {
+    const CeedScalar *x_array, *du_array;
+
+    CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
+    CeedVectorGetArrayRead(du_to, CEED_MEM_HOST, &du_array);
+    for (CeedInt i = 0; i < p_to_dim; i++) {
+      CeedScalar coord[dim];
+
+      for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_to_dim * d + i];
+      for (CeedInt d = 0; d < dim; d++) {
+        const CeedScalar du = EvalGrad(d, coord);
+
+        if (fabs(du - du_array[p_to_dim * d + i]) > tol) {
+          // LCOV_EXCL_START
+          printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, d, du_array[p_to_dim * (dim - 1 - d) + i], du);
+          // LCOV_EXCL_STOP
+        }
+      }
+    }
+    CeedVectorRestoreArrayRead(x_to, &x_array);
+    CeedVectorRestoreArrayRead(du_to, &du_array);
+  }
+}
+
 int main(int argc, char **argv) {
   Ceed ceed;
 
@@ -43,14 +116,7 @@ int main(int argc, char **argv) {
     CeedVector x_corners, x_from, x_to, u_from, u_to, du_to;
     CeedBasis  basis_x, basis_from, basis_to, basis_project;
     CeedInt    p_from = 5, p_to = 6, q = 7, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim);
-    CeedScalar tol;
-
-    {
-      CeedScalarType scalar_type;
 
-      CeedGetScalarType(&scalar_type);
-      tol = GetTolerance(scalar_type, dim);
-    }
     CeedVectorCreate(ceed, x_dim * dim, &x_corners);
     {
       CeedScalar x_array[x_dim * dim];
@@ -82,66 +148,39 @@ int main(int argc, char **argv) {
     CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_to, q, CEED_GAUSS, &basis_to);
     CeedBasisCreateProjection(basis_from, basis_to, &basis_project);
 
-    // Setup coarse solution
-    {
-      const CeedScalar *x_array;
-      CeedScalar        u_array[p_from_dim];
-
-      CeedVectorGetArrayRead(x_from, CEED_MEM_HOST, &x_array);
-      for (CeedInt i = 0; i < p_from_dim; i++) {
-        CeedScalar coord[dim];
-        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_from_dim * d + i];
-        u_array[i] = Eval(dim, coord);
-      }
-      CeedVectorRestoreArrayRead(x_from, &x_array);
-      CeedVectorSetArray(u_from, CEED_MEM_HOST, CEED_COPY_VALUES, u_array);
-    }
-
-    // Project to fine basis
-    CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u_from, u_to);
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
-    // Check solution
+    // Test projection on non-tensor bases
     {
-      const CeedScalar *x_array, *u_array;
-
-      CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
-      CeedVectorGetArrayRead(u_to, CEED_MEM_HOST, &u_array);
-      for (CeedInt i = 0; i < p_to_dim; i++) {
-        CeedScalar coord[dim];
-        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_to_dim + i];
-        const CeedScalar u = Eval(dim, coord);
-        if (fabs(u - u_array[i]) > tol) printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_array[i], u);
-      }
-      CeedVectorRestoreArrayRead(x_to, &x_array);
-      CeedVectorRestoreArrayRead(u_to, &u_array);
+      CeedBasis         basis_from_nontensor, basis_to_nontensor;
+      CeedElemTopology  topo;
+      CeedInt           num_comp, num_nodes, nqpts;
+      const CeedScalar *interp, *grad;
+
+      CeedBasisGetTopology(basis_from, &topo);
+      CeedBasisGetNumComponents(basis_from, &num_comp);
+      CeedBasisGetNumNodes(basis_from, &num_nodes);
+      CeedBasisGetNumQuadraturePoints(basis_from, &nqpts);
+      CeedBasisGetInterp(basis_from, &interp);
+      CeedBasisGetGrad(basis_from, &grad);
+      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, nqpts, interp, grad, NULL, NULL, &basis_from_nontensor);
+
+      CeedBasisGetTopology(basis_to, &topo);
+      CeedBasisGetNumComponents(basis_to, &num_comp);
+      CeedBasisGetNumNodes(basis_to, &num_nodes);
+      CeedBasisGetNumQuadraturePoints(basis_to, &nqpts);
+      CeedBasisGetInterp(basis_to, &interp);
+      CeedBasisGetGrad(basis_to, &grad);
+      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, nqpts, interp, grad, NULL, NULL, &basis_to_nontensor);
+
+      CeedBasisDestroy(&basis_project);
+      CeedBasisCreateProjection(basis_from_nontensor, basis_to_nontensor, &basis_project);
+
+      CeedBasisDestroy(&basis_to_nontensor);
+      CeedBasisDestroy(&basis_from_nontensor);
     }
 
-    // Project and take gradient
-    CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u_from, du_to);
-
-    // Check solution
-    {
-      const CeedScalar *x_array, *du_array;
-
-      CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
-      CeedVectorGetArrayRead(du_to, CEED_MEM_HOST, &du_array);
-      for (CeedInt i = 0; i < p_to_dim; i++) {
-        CeedScalar coord[dim];
-
-        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_to_dim * d + i];
-        for (CeedInt d = 0; d < dim; d++) {
-          const CeedScalar du = EvalGrad(d, coord);
-
-          if (fabs(du - du_array[p_to_dim * d + i]) > tol) {
-            // LCOV_EXCL_START
-            printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, d, du_array[p_to_dim * (dim - 1 - d) + i], du);
-            // LCOV_EXCL_STOP
-          }
-        }
-      }
-      CeedVectorRestoreArrayRead(x_to, &x_array);
-      CeedVectorRestoreArrayRead(du_to, &du_array);
-    }
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
     CeedVectorDestroy(&x_corners);
     CeedVectorDestroy(&x_from);

From e104ad118a2093af56612daf7249aa7085194bc6 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 20 Jun 2024 20:23:33 -0600
Subject: [PATCH 083/571] basis: Allow CreateProjection for mixed-tensor bases

---
 interface/ceed-basis.c | 36 ++++++++++++++++++++++++------------
 tests/t319-basis.c     | 23 ++++++++++++++++-------
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index f134cd30bd..30207eb3b3 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -195,7 +195,7 @@ static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedI
 **/
 static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis basis_to, CeedScalar **interp_project, CeedScalar **grad_project) {
   Ceed    ceed;
-  bool    is_tensor_to, is_tensor_from;
+  bool    are_both_tensor;
   CeedInt Q, Q_to, Q_from, P_to, P_from;
 
   CeedCall(CeedBasisGetCeed(basis_to, &ceed));
@@ -207,10 +207,14 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   Q = Q_to;
 
   // Check for matching tensor or non-tensor
-  CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to));
-  CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from));
-  CeedCheck(is_tensor_to == is_tensor_from, ceed, CEED_ERROR_MINOR, "Bases must both be tensor or non-tensor");
-  if (is_tensor_to) {
+  {
+    bool is_tensor_to, is_tensor_from;
+
+    CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to));
+    CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from));
+    are_both_tensor = is_tensor_to && is_tensor_from;
+  }
+  if (are_both_tensor) {
     CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_to));
     CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_from));
     CeedCall(CeedBasisGetNumQuadraturePoints1D(basis_from, &Q));
@@ -231,7 +235,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   const CeedScalar *interp_to_source = NULL, *interp_from_source = NULL, *grad_from_source = NULL;
 
   CeedCall(CeedBasisGetDimension(basis_to, &dim));
-  if (is_tensor_to) {
+  if (are_both_tensor) {
     CeedCall(CeedBasisGetInterp1D(basis_to, &interp_to_source));
     CeedCall(CeedBasisGetInterp1D(basis_from, &interp_from_source));
   } else {
@@ -246,19 +250,19 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   // projection basis will have a gradient operation (allocated even if not H^1 for the
   // basis construction later on)
   if (fe_space_to == CEED_FE_SPACE_H1) {
-    if (is_tensor_to) {
+    if (are_both_tensor) {
       CeedCall(CeedBasisGetGrad1D(basis_from, &grad_from_source));
     } else {
       CeedCall(CeedBasisGetGrad(basis_from, &grad_from_source));
     }
   }
-  CeedCall(CeedCalloc(P_to * P_from * (is_tensor_to ? 1 : dim), grad_project));
+  CeedCall(CeedCalloc(P_to * P_from * (are_both_tensor ? 1 : dim), grad_project));
 
   // Compute interp_to^+, pseudoinverse of interp_to
   CeedCall(CeedCalloc(Q * q_comp * P_to, &interp_to_inv));
   CeedCall(CeedMatrixPseudoinverse(ceed, interp_to_source, Q * q_comp, P_to, interp_to_inv));
   // Build matrices
-  CeedInt     num_matrices = 1 + (fe_space_to == CEED_FE_SPACE_H1) * (is_tensor_to ? 1 : dim);
+  CeedInt     num_matrices = 1 + (fe_space_to == CEED_FE_SPACE_H1) * (are_both_tensor ? 1 : dim);
   CeedScalar *input_from[num_matrices], *output_project[num_matrices];
 
   input_from[0]     = (CeedScalar *)interp_from_source;
@@ -1322,6 +1326,8 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
   Note: `basis_project` will have the same number of components as `basis_from`, regardless of the number of components that `basis_to` has.
         If `basis_from` has 3 components and `basis_to` has 5 components, then `basis_project` will have 3 components.
 
+  Note: If either `basis_from` or `basis_to` are non-tensor, then `basis_project` will also be non-tensor
+
   @param[in]  basis_from    `CeedBasis` to prolong from
   @param[in]  basis_to      `CeedBasis` to prolong to
   @param[out] basis_project Address of the variable where the newly created `CeedBasis` will be stored
@@ -1332,7 +1338,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
 **/
 int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project) {
   Ceed        ceed;
-  bool        is_tensor;
+  bool        create_tensor;
   CeedInt     dim, num_comp;
   CeedScalar *interp_project, *grad_project;
 
@@ -1342,10 +1348,16 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi
   CeedCall(CeedBasisCreateProjectionMatrices(basis_from, basis_to, &interp_project, &grad_project));
 
   // Build basis
-  CeedCall(CeedBasisIsTensor(basis_to, &is_tensor));
+  {
+    bool is_tensor_to, is_tensor_from;
+
+    CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to));
+    CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from));
+    create_tensor = is_tensor_from && is_tensor_to;
+  }
   CeedCall(CeedBasisGetDimension(basis_to, &dim));
   CeedCall(CeedBasisGetNumComponents(basis_from, &num_comp));
-  if (is_tensor) {
+  if (create_tensor) {
     CeedInt P_1d_to, P_1d_from;
 
     CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_1d_from));
diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index 3c2d99fa02..2aee7d4e4f 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -150,9 +150,9 @@ int main(int argc, char **argv) {
 
     VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
-    // Test projection on non-tensor bases
+    // Create non-tensor bases
+    CeedBasis basis_from_nontensor, basis_to_nontensor;
     {
-      CeedBasis         basis_from_nontensor, basis_to_nontensor;
       CeedElemTopology  topo;
       CeedInt           num_comp, num_nodes, nqpts;
       const CeedScalar *interp, *grad;
@@ -172,14 +172,21 @@ int main(int argc, char **argv) {
       CeedBasisGetInterp(basis_to, &interp);
       CeedBasisGetGrad(basis_to, &grad);
       CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, nqpts, interp, grad, NULL, NULL, &basis_to_nontensor);
+    }
 
-      CeedBasisDestroy(&basis_project);
-      CeedBasisCreateProjection(basis_from_nontensor, basis_to_nontensor, &basis_project);
+    // Test projection on non-tensor bases
+    CeedBasisDestroy(&basis_project);
+    CeedBasisCreateProjection(basis_from_nontensor, basis_to_nontensor, &basis_project);
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
-      CeedBasisDestroy(&basis_to_nontensor);
-      CeedBasisDestroy(&basis_from_nontensor);
-    }
+    // Test projection from non-tensor to tensor
+    CeedBasisDestroy(&basis_project);
+    CeedBasisCreateProjection(basis_from_nontensor, basis_to, &basis_project);
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
+    // Test projection from tensor to non-tensor
+    CeedBasisDestroy(&basis_project);
+    CeedBasisCreateProjection(basis_from, basis_to_nontensor, &basis_project);
     VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
     CeedVectorDestroy(&x_corners);
@@ -189,7 +196,9 @@ int main(int argc, char **argv) {
     CeedVectorDestroy(&u_to);
     CeedVectorDestroy(&du_to);
     CeedBasisDestroy(&basis_from);
+    CeedBasisDestroy(&basis_from_nontensor);
     CeedBasisDestroy(&basis_to);
+    CeedBasisDestroy(&basis_to_nontensor);
     CeedBasisDestroy(&basis_project);
   }
   CeedDestroy(&ceed);

From 0b8f3c4edb119a4d519cc48e74ffe3275b15e198 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 09:39:52 -0600
Subject: [PATCH 084/571] vec - add CeedVectorCopyStrided and
 CeedVectorSetStrided utils

---
 include/ceed-impl.h     |  2 ++
 include/ceed/ceed.h     |  2 ++
 interface/ceed-vector.c | 80 +++++++++++++++++++++++++++++++++++++++++
 interface/ceed.c        |  2 ++
 tests/t127-vector.c     | 60 +++++++++++++++++++++++++++++++
 5 files changed, 146 insertions(+)
 create mode 100644 tests/t127-vector.c

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 956b30b757..de206e9e59 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -126,8 +126,10 @@ struct CeedVector_private {
   Ceed ceed;
   int (*HasValidArray)(CeedVector, bool *);
   int (*HasBorrowedArrayOfType)(CeedVector, CeedMemType, bool *);
+  int (*CopyStrided)(CeedVector, CeedSize, CeedSize, CeedVector);
   int (*SetArray)(CeedVector, CeedMemType, CeedCopyMode, CeedScalar *);
   int (*SetValue)(CeedVector, CeedScalar);
+  int (*SetValueStrided)(CeedVector, CeedSize, CeedSize, CeedScalar);
   int (*SyncArray)(CeedVector, CeedMemType);
   int (*TakeArray)(CeedVector, CeedMemType, CeedScalar **);
   int (*GetArray)(CeedVector, CeedMemType, CeedScalar **);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 4c950a02bd..4fff8784bc 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -180,8 +180,10 @@ CEED_EXTERN int CeedGetPreferredMemType(Ceed ceed, CeedMemType *type);
 CEED_EXTERN int  CeedVectorCreate(Ceed ceed, CeedSize len, CeedVector *vec);
 CEED_EXTERN int  CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy);
 CEED_EXTERN int  CeedVectorCopy(CeedVector vec, CeedVector vec_copy);
+CEED_EXTERN int  CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVector vec_copy);
 CEED_EXTERN int  CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array);
 CEED_EXTERN int  CeedVectorSetValue(CeedVector vec, CeedScalar value);
+CEED_EXTERN int  CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, CeedScalar value);
 CEED_EXTERN int  CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type);
 CEED_EXTERN int  CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array);
 CEED_EXTERN int  CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array);
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 6c93f5b6df..0794f69954 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -238,6 +238,50 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Copy a strided portion of `CeedVector` contents into a different `CeedVector`
+
+  @param[in]     vec      `CeedVector` to copy
+  @param[in]     start    First index to copy
+  @param[in]     step     Stride between indices to copy
+  @param[in,out] vec_copy `CeedVector` to copy values to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVector vec_copy) {
+  CeedSize          length;
+  const CeedScalar *array;
+  CeedScalar       *array_copy;
+
+  // Backend version
+  if (vec->CopyStrided && vec_copy->CopyStrided) {
+    CeedCall(vec->CopyStrided(vec, start, step, vec_copy));
+    vec_copy->state += 2;
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Get length
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCall(CeedVectorGetLength(vec, &length_vec));
+    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    length = length_vec > length_copy ? length_vec : length_copy;
+  }
+
+  // Copy
+  CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &array));
+  CeedCall(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &array_copy));
+  for (CeedSize i = start; i < length; i += step) array_copy[i] = array[i];
+
+  // Cleanup
+  CeedCall(CeedVectorRestoreArrayRead(vec, &array));
+  CeedCall(CeedVectorRestoreArray(vec_copy, &array_copy));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Set the array used by a `CeedVector`, freeing any previously allocated array if applicable.
 
@@ -301,6 +345,42 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set a portion of a `CeedVector` to a constant value.
+
+  Note: The `CeedVector` must already have valid data set via @ref CeedVectorSetArray() or similar.
+
+  @param[in,out] vec   `CeedVector`
+  @param[in]     start First index to set
+  @param[in]     step  Stride between indices to set
+  @param[in]     value Value to be used
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, CeedScalar value) {
+  Ceed ceed;
+
+  CeedCall(CeedVectorGetCeed(vec, &ceed));
+  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+
+  if (vec->SetValueStrided) {
+    CeedCall(vec->SetValueStrided(vec, start, step, value));
+    vec_copy->state += 2;
+  } else {
+    CeedSize    length;
+    CeedScalar *array;
+
+    CeedCall(CeedVectorGetArray(vec, CEED_MEM_HOST, &array));
+    CeedCall(CeedVectorGetLength(vec, &length));
+    for (CeedSize i = start; i < length; i += step) array[i] = value;
+    CeedCall(CeedVectorRestoreArray(vec, &array));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Sync the `CeedVector` to a specified `mem_type`.
 
diff --git a/interface/ceed.c b/interface/ceed.c
index 5800ee2bb4..073220d187 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -924,9 +924,11 @@ int CeedInit(const char *resource, Ceed *ceed) {
       CEED_FTABLE_ENTRY(Ceed, CompositeOperatorCreate),
       CEED_FTABLE_ENTRY(CeedVector, HasValidArray),
       CEED_FTABLE_ENTRY(CeedVector, HasBorrowedArrayOfType),
+      CEED_FTABLE_ENTRY(CeedVector, CopyStrided),
       CEED_FTABLE_ENTRY(CeedVector, SetArray),
       CEED_FTABLE_ENTRY(CeedVector, TakeArray),
       CEED_FTABLE_ENTRY(CeedVector, SetValue),
+      CEED_FTABLE_ENTRY(CeedVector, SetValueStrided),
       CEED_FTABLE_ENTRY(CeedVector, SyncArray),
       CEED_FTABLE_ENTRY(CeedVector, GetArray),
       CEED_FTABLE_ENTRY(CeedVector, GetArrayRead),
diff --git a/tests/t127-vector.c b/tests/t127-vector.c
new file mode 100644
index 0000000000..0a38056c1d
--- /dev/null
+++ b/tests/t127-vector.c
@@ -0,0 +1,60 @@
+/// @file
+/// Test strided setting and copying of vectors
+/// \test Test strided setting and copying of vectors
+#include <ceed.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedSize   start = 2, step = 3;
+  CeedVector x, y;
+  CeedInt    len = 10;
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, len, &x);
+  CeedVectorCreate(ceed, len, &y);
+  
+  // Set strided
+  CeedVectorSetValue(x, 1.0);
+  CeedVectorSetValueStrided(x, start, step, 42.0);
+  {
+    const CeedScalar *read_array;
+
+    CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
+    for (CeedInt i = 0; i < len; i++) {
+      CeedScalar value = (i - start) % step == 0 ? 42.0 : 1.0;
+
+      if (read_array[i] != value) {
+        // LCOV_EXCL_START
+        printf("Error in setting value in x at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, read_array[i], value);
+        // LCOV_EXCL_STOP
+      }
+    }
+    CeedVectorRestoreArrayRead(x, &read_array);
+  }
+  
+  // Copy strided
+  CeedVectorSetValue(y, 0.0);
+  CeedVectorCopyStrided(x, start, step, y);
+  {
+    const CeedScalar *read_array;
+
+    CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
+    for (CeedInt i = 0; i < len; i++) {
+      CeedScalar value = (i - start) % step == 0 ? 42.0 : 0.0;
+
+      if (read_array[i] != value) {
+        // LCOV_EXCL_START
+        printf("Error in copying value to y at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, read_array[i], value);
+        // LCOV_EXCL_STOP
+      }
+    }
+    CeedVectorRestoreArrayRead(y, &read_array);
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&y);
+  CeedDestroy(&ceed);
+  return 0;
+}

From 2d4e06054ce9048e5293b6d862d152ac33373122 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 09:41:40 -0600
Subject: [PATCH 085/571] vec - fix double state incrementing

---
 interface/ceed-vector.c | 4 ++--
 tests/t127-vector.c     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 0794f69954..263df4cfe0 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -332,6 +332,7 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
 
   if (vec->SetValue) {
     CeedCall(vec->SetValue(vec, value));
+    vec->state += 2;
   } else {
     CeedSize    length;
     CeedScalar *array;
@@ -341,7 +342,6 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
     for (CeedSize i = 0; i < length; i++) array[i] = value;
     CeedCall(CeedVectorRestoreArray(vec, &array));
   }
-  vec->state += 2;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -368,7 +368,7 @@ int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, Ceed
 
   if (vec->SetValueStrided) {
     CeedCall(vec->SetValueStrided(vec, start, step, value));
-    vec_copy->state += 2;
+    vec->state += 2;
   } else {
     CeedSize    length;
     CeedScalar *array;
diff --git a/tests/t127-vector.c b/tests/t127-vector.c
index 0a38056c1d..e9fb578d65 100644
--- a/tests/t127-vector.c
+++ b/tests/t127-vector.c
@@ -14,7 +14,7 @@ int main(int argc, char **argv) {
 
   CeedVectorCreate(ceed, len, &x);
   CeedVectorCreate(ceed, len, &y);
-  
+
   // Set strided
   CeedVectorSetValue(x, 1.0);
   CeedVectorSetValueStrided(x, start, step, 42.0);
@@ -33,7 +33,7 @@ int main(int argc, char **argv) {
     }
     CeedVectorRestoreArrayRead(x, &read_array);
   }
-  
+
   // Copy strided
   CeedVectorSetValue(y, 0.0);
   CeedVectorCopyStrided(x, start, step, y);

From e706ae07528c369d8ff94b8781ba9f488031a0f5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 09:46:16 -0600
Subject: [PATCH 086/571] vec - fix documentation, checking in CeedVectorCopy

---
 interface/ceed-vector.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 263df4cfe0..da44920f44 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -202,13 +202,8 @@ int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy) {
 /**
   @brief Copy a `CeedVector` into a different `CeedVector`.
 
-  Both pointers should be destroyed with @ref CeedVectorDestroy().
-
-  Note: If `*vec_copy` is non-`NULL`, then it is assumed that `*vec_copy` is a pointer to a `CeedVector`.
-        This `CeedVector` will be destroyed if `*vec_copy` is the only reference to this `CeedVector`.
-
   @param[in]     vec      `CeedVector` to copy
-  @param[in,out] vec_copy Variable to store copied `CeedVector` to
+  @param[in,out] vec_copy `CeedVector` to copy array into
 
   @return An error code: 0 - success, otherwise - failure
 
@@ -230,6 +225,15 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
   // Check that both have same memory type
   if (mem_type != mem_type_copy) mem_type = CEED_MEM_HOST;
 
+  // Check compatible lengths
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCall(CeedVectorGetLength(vec, &length_vec));
+    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    CeedCheck(length_vec == length_copy, ceed, CEED_ERROR_INCOMPATIBLE, "CeedVectors must have the same length to copy");
+  }
+
   // Copy the values from vec to vec_copy
   CeedCall(CeedVectorGetArray(vec, mem_type, &array));
   CeedCall(CeedVectorSetArray(vec_copy, mem_type, CEED_COPY_VALUES, array));

From 9ef220489445f31398d6e42662820ac48a00a591 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 09:49:45 -0600
Subject: [PATCH 087/571] cuda - follow multi line statement conventions

---
 backends/cuda-ref/kernels/cuda-ref-vector.cu | 39 +++++++-------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 51c5565308..bbb75fdf08 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -14,8 +14,7 @@
 __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
                                  CeedScalar val) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
+  if (index >= size) return;
   vec[index] = val;
 }
 
@@ -28,8 +27,7 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
   const CeedSize vec_size = length;
   int grid_size = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
+  if (block_size * grid_size < vec_size) grid_size += 1;
   setValueK<<<grid_size,block_size>>>(d_array, length, val);
   return 0;
 }
@@ -39,10 +37,8 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
 //------------------------------------------------------------------------------
 __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
-  if (fabs(vec[index]) > 1E-16)
-    vec[index] = 1./vec[index];
+  if (index >= size) return;
+  if (fabs(vec[index]) > 1E-16) vec[index] = 1./vec[index];
 }
 
 //------------------------------------------------------------------------------
@@ -53,8 +49,7 @@ extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
   const CeedSize vec_size = length;
   int grid_size = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
+  if (block_size * grid_size < vec_size) grid_size += 1;
   rcpValueK<<<grid_size,block_size>>>(d_array, length);
   return 0;
 }
@@ -65,8 +60,7 @@ extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
 __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
     CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
+  if (index >= size) return;
   x[index] *= alpha;
 }
 
@@ -79,8 +73,7 @@ extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
   const CeedSize vec_size = length;
   int grid_size = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
+  if (block_size * grid_size < vec_size) grid_size += 1;
   scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length);
   return 0;
 }
@@ -91,8 +84,7 @@ extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
 __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
     CeedScalar * __restrict__ x, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
+  if (index >= size) return;
   y[index] += alpha * x[index];
 }
 
@@ -105,8 +97,7 @@ extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
   const CeedSize vec_size = length;
   int grid_size = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
+  if (block_size * grid_size < vec_size) grid_size += 1;
   axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length);
   return 0;
 }
@@ -117,8 +108,7 @@ extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
 __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
     CeedScalar * __restrict__ x, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
+  if (index >= size) return;
   y[index] = beta * y[index];
   y[index] += alpha * x[index];
 }
@@ -132,8 +122,7 @@ extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedS
   const CeedSize vec_size = length;
   int grid_size = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
+  if (block_size * grid_size < vec_size) grid_size += 1;
   axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length);
   return 0;
 }
@@ -144,8 +133,7 @@ extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedS
 __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
     CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
+  if (index >= size) return;
   w[index] = x[index] * y[index];
 }
 
@@ -158,8 +146,7 @@ extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_a
   const CeedSize vec_size = length;
   int grid_size = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
+  if (block_size * grid_size < vec_size) grid_size += 1;
   pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length);
   return 0;
 }

From f1c2287b836e28c06f03661cee66832fa5f0f99d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 10:03:25 -0600
Subject: [PATCH 088/571] cuda - add Vec*Strided utils

---
 backends/cuda-ref/ceed-cuda-ref-vector.c     | 78 ++++++++++++++++++++
 backends/cuda-ref/kernels/cuda-ref-vector.cu | 48 ++++++++++++
 2 files changed, 126 insertions(+)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 2759b38a4c..0aeb445cd8 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -223,6 +223,47 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_t
   return CEED_ERROR_UNSUPPORTED;
 }
 
+//------------------------------------------------------------------------------
+// Copy host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostCopyStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *h_copy_array) {
+  for (CeedSize i = start; i < length; i += step) h_copy_array[i] = h_array[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Copy device array to value strided (impl in .cu file)
+//------------------------------------------------------------------------------
+int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array);
+
+//------------------------------------------------------------------------------
+// Copy a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize step, CeedVector vec_copy) {
+  CeedSize         length;
+  CeedVector_Cuda *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  // Set value for synced device/host array
+  if (impl->d_array) {
+    CeedScalar *copy_array;
+
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
+    CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, step, length, copy_array));
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+  } else if (impl->h_array) {
+    CeedScalar *copy_array;
+
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
+    CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, step, length, copy_array));
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Set host array to value
 //------------------------------------------------------------------------------
@@ -270,6 +311,41 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Set host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
+  for (CeedSize i = start; i < length; i += step) h_array[i] = val;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set device array to value strided (impl in .cu file)
+//------------------------------------------------------------------------------
+int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val);
+
+//------------------------------------------------------------------------------
+// Set a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorSetValueStrided_Cuda(CeedVector vec, CeedSize start, CeedSize step, CeedScalar val) {
+  CeedSize         length;
+  CeedVector_Cuda *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  // Set value for synced device/host array
+  if (impl->d_array) {
+    CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, step, length, val));
+    impl->h_array = NULL;
+  } else if (impl->h_array) {
+    CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, step, length, val));
+    impl->d_array = NULL;
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Vector Take Array
 //------------------------------------------------------------------------------
@@ -728,7 +804,9 @@ int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", (int (*)())CeedVectorCopyStrided_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", (int (*)())CeedVectorSetValueStrided_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda));
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index bbb75fdf08..43e558107e 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -8,6 +8,30 @@
 #include <ceed.h>
 #include <cuda.h>
 
+//------------------------------------------------------------------------------
+// Kernel for copy strided on device
+//------------------------------------------------------------------------------
+__global__ static void copyStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
+                                    CeedSize size, CeedScalar * __restrict__ vec_copy) {
+  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  if (index >= size) return;
+  if ((index - start) % step == 0) vec_copy[index] = vec[index];
+}
+
+//------------------------------------------------------------------------------
+// Copy strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
+                                          CeedSize length, CeedScalar* d_copy_array) {
+  const int block_size = 512;
+  const CeedSize vec_size = length;
+  int grid_size = vec_size / block_size;
+
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  copyStridedK<<<grid_size,block_size>>>(d_array, start, step, length, d_copy_array);
+  return 0;
+}
+
 //------------------------------------------------------------------------------
 // Kernel for set value on device
 //------------------------------------------------------------------------------
@@ -32,6 +56,30 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
   return 0;
 }
 
+//------------------------------------------------------------------------------
+// Kernel for set value strided on device
+//------------------------------------------------------------------------------
+__global__ static void setValueStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
+                                        CeedSize size, CeedScalar val) {
+  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  if (index >= size) return;
+  if ((index - start) % step == 0) vec[index] = val;
+}
+
+//------------------------------------------------------------------------------
+// Set value strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
+                                              CeedSize length, CeedScalar val) {
+  const int block_size = 512;
+  const CeedSize vec_size = length;
+  int grid_size = vec_size / block_size;
+
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  setValueStridedK<<<grid_size,block_size>>>(d_array, start, step, length, val);
+  return 0;
+}
+
 //------------------------------------------------------------------------------
 // Kernel for taking reciprocal
 //------------------------------------------------------------------------------

From 3196072fa7d47cffcf474f8b421f80aad90fa9c5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 10:07:09 -0600
Subject: [PATCH 089/571] hip - add Vec*Strided utils

---
 backends/hip-ref/ceed-hip-ref-vector.c        | 78 +++++++++++++++++++
 .../hip-ref/kernels/hip-ref-vector.hip.cpp    | 46 +++++++++++
 2 files changed, 124 insertions(+)

diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 5789679578..62ae8bdb55 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -223,6 +223,47 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_ty
   return CEED_ERROR_UNSUPPORTED;
 }
 
+//------------------------------------------------------------------------------
+// Copy host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostCopyStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *h_copy_array) {
+  for (CeedSize i = start; i < length; i += step) h_copy_array[i] = h_array[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Copy device array to value strided (impl in .hip.cc file)
+//------------------------------------------------------------------------------
+int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array);
+
+//------------------------------------------------------------------------------
+// Copy a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize step, CeedVector vec_copy) {
+  CeedSize        length;
+  CeedVector_Hip *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  // Set value for synced device/host array
+  if (impl->d_array) {
+    CeedScalar *copy_array;
+
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
+    CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, step, length, copy_array));
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+  } else if (impl->h_array) {
+    CeedScalar *copy_array;
+
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
+    CeedCallBackend(CeedHostCopyStrided_Hip(impl->h_array, start, step, length, copy_array));
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Set host array to value
 //------------------------------------------------------------------------------
@@ -270,6 +311,41 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Set host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
+  for (CeedSize i = start; i < length; i += step) h_array[i] = val;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set device array to value strided (impl in .hip.cc file)
+//------------------------------------------------------------------------------
+int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val);
+
+//------------------------------------------------------------------------------
+// Set a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorSetValueStrided_Hip(CeedVector vec, CeedSize start, CeedSize step, CeedScalar val) {
+  CeedSize        length;
+  CeedVector_Hip *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  // Set value for synced device/host array
+  if (impl->d_array) {
+    CeedCallBackend(CeedDeviceSetValueStrided_Hip(impl->d_array, start, step, length, val));
+    impl->h_array = NULL;
+  } else if (impl->h_array) {
+    CeedCallBackend(CeedHostSetValueStrided_Hip(impl->h_array, start, step, length, val));
+    impl->d_array = NULL;
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Vector Take Array
 //------------------------------------------------------------------------------
@@ -690,7 +766,9 @@ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", (int (*)())CeedVectorCopyStrided_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", (int (*)())CeedVectorSetValueStrided_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Hip));
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index 5f6dd15f2a..5375d2e10b 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -8,6 +8,29 @@
 #include <ceed.h>
 #include <hip/hip_runtime.h>
 
+//------------------------------------------------------------------------------
+// Kernel for copy strided on device
+//------------------------------------------------------------------------------
+__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
+  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index >= size) return;
+  if ((index - start) % step == 0) vec_copy[index] = vec[index];
+}
+
+//------------------------------------------------------------------------------
+// Copy strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
+
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  hipLaunchKernelGGL(copyStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array);
+  return 0;
+}
+
 //------------------------------------------------------------------------------
 // Kernel for set value on device
 //------------------------------------------------------------------------------
@@ -31,6 +54,29 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed
   return 0;
 }
 
+//------------------------------------------------------------------------------
+// Kernel for set value strided on device
+//------------------------------------------------------------------------------
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
+  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index >= size) return;
+  if ((index - start) % step == 0) vec[index] = val;
+}
+
+//------------------------------------------------------------------------------
+// Set value strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
+
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, val);
+  return 0;
+}
+
 //------------------------------------------------------------------------------
 // Kernel for taking reciprocal
 //------------------------------------------------------------------------------

From 956a3dba83dc7b9f65c506526c501667c011375a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 11:02:39 -0600
Subject: [PATCH 090/571] hip - fix references to .cu

---
 backends/hip-ref/ceed-hip-ref-vector.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 62ae8bdb55..83f00d2701 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -232,7 +232,7 @@ static int CeedHostCopyStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize
 }
 
 //------------------------------------------------------------------------------
-// Copy device array to value strided (impl in .hip.cc file)
+// Copy device array to value strided (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array);
 
@@ -320,7 +320,7 @@ static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, Ceed
 }
 
 //------------------------------------------------------------------------------
-// Set device array to value strided (impl in .hip.cc file)
+// Set device array to value strided (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val);
 
@@ -582,7 +582,7 @@ static int CeedHostReciprocal_Hip(CeedScalar *h_array, CeedSize length) {
 }
 
 //------------------------------------------------------------------------------
-// Take reciprocal of a vector on device (impl in .cu file)
+// Take reciprocal of a vector on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length);
 
@@ -610,7 +610,7 @@ static int CeedHostScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize len
 }
 
 //------------------------------------------------------------------------------
-// Compute x = alpha x on device (impl in .cu file)
+// Compute x = alpha x on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
 
@@ -638,7 +638,7 @@ static int CeedHostAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x
 }
 
 //------------------------------------------------------------------------------
-// Compute y = alpha x + y on device (impl in .cu file)
+// Compute y = alpha x + y on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length);
 
@@ -673,7 +673,7 @@ static int CeedHostAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar b
 }
 
 //------------------------------------------------------------------------------
-// Compute y = alpha x + beta y on device (impl in .cu file)
+// Compute y = alpha x + beta y on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length);
 
@@ -708,7 +708,7 @@ static int CeedHostPointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, C
 }
 
 //------------------------------------------------------------------------------
-// Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
+// Compute the pointwise multiplication w = x .* y on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length);
 

From b73fa92ca232cd7a1379d6ffb54e013a90896973 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 11:04:52 -0600
Subject: [PATCH 091/571] style - also format .cu files

---
 Makefile                                      |   2 +-
 backends/cuda-ref/kernels/cuda-ref-vector.cu  | 112 ++++++++----------
 .../cuda-shared/kernels/cuda-shared-basis.cu  |  23 ++--
 3 files changed, 60 insertions(+), 77 deletions(-)

diff --git a/Makefile b/Makefile
index 85c5a9d481..aa1fead449 100644
--- a/Makefile
+++ b/Makefile
@@ -780,7 +780,7 @@ CLANG_FORMAT_OPTS += -style=file -i
 AUTOPEP8          ?= autopep8
 AUTOPEP8_OPTS     += --in-place --aggressive --max-line-length 120
 
-format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]'))
+format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]' '*.cu'))
 format.py := $(filter-out tests/junit-xml/junit_xml/__init__.py, $(shell git ls-files '*.py'))
 format.ot := $(filter-out doc/sphinx/source/CODE_OF_CONDUCT.md doc/sphinx/source/CONTRIBUTING.md, $(shell git ls-files '*.md' '*.f90'))
 
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 43e558107e..29788bf4fd 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -11,8 +11,7 @@
 //------------------------------------------------------------------------------
 // Kernel for copy strided on device
 //------------------------------------------------------------------------------
-__global__ static void copyStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
-                                    CeedSize size, CeedScalar * __restrict__ vec_copy) {
+__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
   if ((index - start) % step == 0) vec_copy[index] = vec[index];
@@ -21,22 +20,20 @@ __global__ static void copyStridedK(CeedScalar * __restrict__ vec, CeedSize star
 //------------------------------------------------------------------------------
 // Copy strided on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
-                                          CeedSize length, CeedScalar* d_copy_array) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  copyStridedK<<<grid_size,block_size>>>(d_array, start, step, length, d_copy_array);
+  copyStridedK<<<grid_size, block_size>>>(d_array, start, step, length, d_copy_array);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for set value on device
 //------------------------------------------------------------------------------
-__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
-                                 CeedScalar val) {
+__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
   vec[index] = val;
@@ -45,22 +42,20 @@ __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
 //------------------------------------------------------------------------------
 // Set value on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
-                                       CeedScalar val) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  setValueK<<<grid_size,block_size>>>(d_array, length, val);
+  setValueK<<<grid_size, block_size>>>(d_array, length, val);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for set value strided on device
 //------------------------------------------------------------------------------
-__global__ static void setValueStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
-                                        CeedSize size, CeedScalar val) {
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
   if ((index - start) % step == 0) vec[index] = val;
@@ -69,44 +64,42 @@ __global__ static void setValueStridedK(CeedScalar * __restrict__ vec, CeedSize
 //------------------------------------------------------------------------------
 // Set value strided on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
-                                              CeedSize length, CeedScalar val) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  setValueStridedK<<<grid_size,block_size>>>(d_array, start, step, length, val);
+  setValueStridedK<<<grid_size, block_size>>>(d_array, start, step, length, val);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for taking reciprocal
 //------------------------------------------------------------------------------
-__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
+__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
-  if (fabs(vec[index]) > 1E-16) vec[index] = 1./vec[index];
+  if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
 }
 
 //------------------------------------------------------------------------------
 // Take vector reciprocal in device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  rcpValueK<<<grid_size,block_size>>>(d_array, length);
+  rcpValueK<<<grid_size, block_size>>>(d_array, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for scale
 //------------------------------------------------------------------------------
-__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
-    CeedSize size) {
+__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
   x[index] *= alpha;
@@ -115,22 +108,20 @@ __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha
 //------------------------------------------------------------------------------
 // Compute x = alpha x on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
-    CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length);
+  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for axpy
 //------------------------------------------------------------------------------
-__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
-    CeedScalar * __restrict__ x, CeedSize size) {
+__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
   y[index] += alpha * x[index];
@@ -139,22 +130,20 @@ __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
 //------------------------------------------------------------------------------
 // Compute y = alpha x + y on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
-    CeedScalar *x_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length);
+  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for axpby
 //------------------------------------------------------------------------------
-__global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
-    CeedScalar * __restrict__ x, CeedSize size) {
+__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
   y[index] = beta * y[index];
@@ -164,22 +153,20 @@ __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha
 //------------------------------------------------------------------------------
 // Compute y = alpha x + beta y on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
-    CeedScalar *x_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length);
+  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for pointwise mult
 //------------------------------------------------------------------------------
-__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
-    CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
+__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
   if (index >= size) return;
   w[index] = x[index] * y[index];
@@ -188,14 +175,13 @@ __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
 //------------------------------------------------------------------------------
 // Compute the pointwise multiplication w = x .* y on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
-    CeedScalar *y_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length);
+  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
   return 0;
 }
 
diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis.cu
index 1eb03fb2e9..f654f7ddda 100644
--- a/backends/cuda-shared/kernels/cuda-shared-basis.cu
+++ b/backends/cuda-shared/kernels/cuda-shared-basis.cu
@@ -8,16 +8,15 @@
 #include <ceed.h>
 #include <cuda.h>
 
-const int sizeMax = 16;
-__constant__ CeedScalar c_B[sizeMax*sizeMax];
-__constant__ CeedScalar c_G[sizeMax*sizeMax];
+const int               sizeMax = 16;
+__constant__ CeedScalar c_B[sizeMax * sizeMax];
+__constant__ CeedScalar c_G[sizeMax * sizeMax];
 
 //------------------------------------------------------------------------------
 // Interp device initialization
 //------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
-                                  CeedScalar **c_B_ptr) {
-  const int bytes = P_1d*Q_1d*sizeof(CeedScalar);
+extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr) {
+  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
 
   cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
   cudaGetSymbolAddress((void **)c_B_ptr, c_B);
@@ -27,9 +26,8 @@ extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
 //------------------------------------------------------------------------------
 // Grad device initialization
 //------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G,
-    CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes = P_1d*Q_1d*sizeof(CeedScalar);
+extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
+  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
 
   cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
   cudaGetSymbolAddress((void **)c_B_ptr, c_B);
@@ -41,10 +39,9 @@ extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G,
 //------------------------------------------------------------------------------
 // Collocated grad device initialization
 //------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G,
-    CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes_interp = P_1d*Q_1d*sizeof(CeedScalar);
-  const int bytes_grad = Q_1d*Q_1d*sizeof(CeedScalar);
+extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
+  const int bytes_interp = P_1d * Q_1d * sizeof(CeedScalar);
+  const int bytes_grad   = Q_1d * Q_1d * sizeof(CeedScalar);
 
   cudaMemcpyToSymbol(c_B, d_B, bytes_interp, 0, cudaMemcpyDeviceToDevice);
   cudaGetSymbolAddress((void **)c_B_ptr, c_B);

From a7efc114811d86487a1a2fdd5ce6c3d337b4ade3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 11:14:40 -0600
Subject: [PATCH 092/571] vec - use min of 2 lengths for gpu impl of
 CopyStrided

---
 backends/cuda-ref/ceed-cuda-ref-vector.c | 8 +++++++-
 backends/hip-ref/ceed-hip-ref-vector.c   | 8 +++++++-
 interface/ceed-vector.c                  | 2 +-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 0aeb445cd8..e15d44789a 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -244,7 +244,13 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
   CeedVector_Cuda *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
-  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCall(CeedVectorGetLength(vec, &length_vec));
+    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    length = length_vec < length_copy ? length_vec : length_copy;
+  }
   // Set value for synced device/host array
   if (impl->d_array) {
     CeedScalar *copy_array;
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 83f00d2701..2883de9e25 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -244,7 +244,13 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
-  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCall(CeedVectorGetLength(vec, &length_vec));
+    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    length = length_vec < length_copy ? length_vec : length_copy;
+  }
   // Set value for synced device/host array
   if (impl->d_array) {
     CeedScalar *copy_array;
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index da44920f44..649a320b63 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -272,7 +272,7 @@ int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVect
 
     CeedCall(CeedVectorGetLength(vec, &length_vec));
     CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
-    length = length_vec > length_copy ? length_vec : length_copy;
+    length = length_vec < length_copy ? length_vec : length_copy;
   }
 
   // Copy

From bb03490db5dd50abd44d5abe8f5d902899c96770 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 12:29:24 -0600
Subject: [PATCH 093/571] tidy - add check tidy needs

---
 interface/ceed-vector.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 649a320b63..e2da964009 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -256,8 +256,8 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
 **/
 int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVector vec_copy) {
   CeedSize          length;
-  const CeedScalar *array;
-  CeedScalar       *array_copy;
+  const CeedScalar *array      = NULL;
+  CeedScalar       *array_copy = NULL;
 
   // Backend version
   if (vec->CopyStrided && vec_copy->CopyStrided) {
@@ -272,6 +272,7 @@ int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVect
 
     CeedCall(CeedVectorGetLength(vec, &length_vec));
     CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    if (length_vec <= 0 || length_copy <= 0) return CEED_ERROR_SUCCESS;
     length = length_vec < length_copy ? length_vec : length_copy;
   }
 
@@ -377,8 +378,9 @@ int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, Ceed
     CeedSize    length;
     CeedScalar *array;
 
-    CeedCall(CeedVectorGetArray(vec, CEED_MEM_HOST, &array));
     CeedCall(CeedVectorGetLength(vec, &length));
+    if (length <= 0) return CEED_ERROR_SUCCESS;
+    CeedCall(CeedVectorGetArray(vec, CEED_MEM_HOST, &array));
     for (CeedSize i = start; i < length; i += step) array[i] = value;
     CeedCall(CeedVectorRestoreArray(vec, &array));
   }

From c1222711bbaa154065e44a2caf05c765092ea957 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Jun 2024 10:36:00 -0600
Subject: [PATCH 094/571] gpu - skip unneeded restrictions in OpApply

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 10 +++++++++-
 backends/cuda-ref/ceed-cuda-ref.h          |  7 ++++---
 backends/hip-ref/ceed-hip-ref-operator.c   | 10 +++++++++-
 backends/hip-ref/ceed-hip-ref.h            |  7 ++++---
 4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 670eb14e12..c6604907ba 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -31,6 +31,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
   }
   CeedCallBackend(CeedFree(&impl->e_vecs));
+  CeedCallBackend(CeedFree(&impl->input_states));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
@@ -201,6 +202,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
 
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
@@ -247,7 +249,13 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu
         // No restriction for this field; read data directly from vec.
         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       } else {
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
+        uint64_t state;
+
+        CeedCallBackend(CeedVectorGetState(vec, &state));
+        if (state != impl->input_states[i]) {
+          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
+          impl->input_states[i] = state;
+        }
         // Get evec
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       }
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index f5a9f059e4..0f4ac3f583 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -123,9 +123,10 @@ typedef struct {
 } CeedOperatorAssemble_Cuda;
 
 typedef struct {
-  CeedVector                *e_vecs;      // E-vectors, inputs followed by outputs
-  CeedVector                *q_vecs_in;   // Input Q-vectors needed to apply operator
-  CeedVector                *q_vecs_out;  // Output Q-vectors needed to apply operator
+  CeedVector                *e_vecs;        // E-vectors, inputs followed by outputs
+  uint64_t                  *input_states;  // State tracking for passive inputs
+  CeedVector                *q_vecs_in;     // Input Q-vectors needed to apply operator
+  CeedVector                *q_vecs_out;    // Output Q-vectors needed to apply operator
   CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
   CeedVector                *qf_active_in;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 486d9bc400..06ff9bfa98 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -30,6 +30,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
   }
   CeedCallBackend(CeedFree(&impl->e_vecs));
+  CeedCallBackend(CeedFree(&impl->input_states));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
@@ -200,6 +201,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
 
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
@@ -246,7 +248,13 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun
         // No restriction for this field; read data directly from vec.
         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       } else {
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
+        uint64_t state;
+
+        CeedCallBackend(CeedVectorGetState(vec, &state));
+        if (state != impl->input_states[i]) {
+          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
+          impl->input_states[i] = state;
+        }
         // Get evec
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       }
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 403cdb71e2..392f9ddb79 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -127,9 +127,10 @@ typedef struct {
 } CeedOperatorAssemble_Hip;
 
 typedef struct {
-  CeedVector               *e_vecs;      // E-vectors, inputs followed by outputs
-  CeedVector               *q_vecs_in;   // Input Q-vectors needed to apply operator
-  CeedVector               *q_vecs_out;  // Output Q-vectors needed to apply operator
+  CeedVector               *e_vecs;        // E-vectors, inputs followed by outputs
+  uint64_t                 *input_states;  // State tracking for passive inputs
+  CeedVector               *q_vecs_in;     // Input Q-vectors needed to apply operator
+  CeedVector               *q_vecs_out;    // Output Q-vectors needed to apply operator
   CeedInt                   num_inputs, num_outputs;
   CeedInt                   num_active_in, num_active_out;
   CeedVector               *qf_active_in;

From c6b75494539d8d8116b11862e504869937fbf3b9 Mon Sep 17 00:00:00 2001
From: Valeria Barra <39932030+valeriabarra@users.noreply.github.com>
Date: Mon, 24 Jun 2024 14:39:17 -0700
Subject: [PATCH 095/571] Update Valeria's email address in CODE_OF_CONDUCT.md

---
 CODE_OF_CONDUCT.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 01e35d0de3..2f48726d3e 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -37,7 +37,7 @@ Examples of representing our community include using an official e-mail address,
 
 ## Enforcement
 
-Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at jed@jedbrown.org, valeria@caltech.edu, or tzanio@llnl.gov.
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at jed@jedbrown.org, vbarra@sdsu.edu, or tzanio@llnl.gov.
 All complaints will be reviewed and investigated promptly and fairly.
 
 All community leaders are obligated to respect the privacy and security of the reporter of any incident.

From 7d5185d77fd3980a37189da5411f3a999b5628b9 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 25 Jun 2024 11:19:20 -0700
Subject: [PATCH 096/571] Add CeedOperatorGetQFunctionAssemblyData so user can
 call CeedQFunctionAssemblyDataDestroy and CeedOperatorAssemblyDataDestroy to
 free up temporary memory

---
 include/ceed/backend.h           |  4 ++-
 interface/ceed-operator.c        | 12 ++++---
 interface/ceed-preconditioning.c | 59 +++++++++++++++++++++++++-------
 3 files changed, 57 insertions(+), 18 deletions(-)

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index d78d37babe..d6c01735f2 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -393,6 +393,8 @@ CEED_EXTERN int  CeedQFunctionContextReference(CeedQFunctionContext ctx);
 
 CEED_EXTERN int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr);
 CEED_EXTERN int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *pointblock_rstr);
+
+CEED_EXTERN int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyData *data);
 CEED_EXTERN int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data);
 CEED_EXTERN int CeedQFunctionAssemblyDataReference(CeedQFunctionAssemblyData data);
 CEED_EXTERN int CeedQFunctionAssemblyDataSetReuse(CeedQFunctionAssemblyData data, bool reuse_assembly_data);
@@ -404,6 +406,7 @@ CEED_EXTERN int CeedQFunctionAssemblyDataSetObjects(CeedQFunctionAssemblyData da
 CEED_EXTERN int CeedQFunctionAssemblyDataGetObjects(CeedQFunctionAssemblyData data, CeedVector *vec, CeedElemRestriction *rstr);
 CEED_EXTERN int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data);
 
+CEED_EXTERN int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data);
 CEED_EXTERN int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssemblyData *data);
 CEED_EXTERN int CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, CeedInt *num_active_bases_in, CeedInt **num_eval_modes_in,
                                                      const CeedEvalMode ***eval_modes_in, CeedSize ***eval_mode_offsets_in,
@@ -418,7 +421,6 @@ CEED_EXTERN int CeedOperatorAssemblyDataGetElemRestrictions(CeedOperatorAssembly
                                                             CeedElemRestriction **active_elem_rstrs_out);
 CEED_EXTERN int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data);
 
-CEED_EXTERN int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data);
 CEED_EXTERN int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis);
 CEED_EXTERN int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis);
 CEED_EXTERN int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *active_rstr);
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 9a1277084a..dca66ad78f 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -742,7 +742,6 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc
   CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf));
   if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf));
   if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT));
-  CeedCall(CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields));
   CeedCall(ceed->OperatorCreate(*op));
@@ -786,7 +785,6 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
   CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf));
   if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf));
   if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT));
-  CeedCall(CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields));
   CeedCall(ceed->OperatorCreateAtPoints(*op));
@@ -1411,7 +1409,10 @@ int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_d
       CeedCall(CeedOperatorSetQFunctionAssemblyReuse(op->sub_operators[i], reuse_assembly_data));
     }
   } else {
-    CeedCall(CeedQFunctionAssemblyDataSetReuse(op->qf_assembled, reuse_assembly_data));
+    CeedQFunctionAssemblyData data;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+    CeedCall(CeedQFunctionAssemblyDataSetReuse(data, reuse_assembly_data));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1440,7 +1441,10 @@ int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs
       CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(sub_operators[i], needs_data_update));
     }
   } else {
-    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, needs_data_update));
+    CeedQFunctionAssemblyData data;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, needs_data_update));
   }
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 88255ced0a..8eb59409d5 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -160,7 +160,12 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
       CeedCall(CeedOperatorFieldGetData(output_fields[i], &field_name, &rstr, &basis, &vec));
       CeedCall(CeedOperatorSetField(op_fallback, field_name, rstr, basis, vec));
     }
-    CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op->qf_assembled, &op_fallback->qf_assembled));
+    {
+      CeedQFunctionAssemblyData data;
+
+      CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+      CeedCall(CeedQFunctionAssemblyDataReferenceCopy(data, &op_fallback->qf_assembled));
+    }
     // Cleanup
     CeedCall(CeedQFunctionDestroy(&qf_fallback));
     CeedCall(CeedQFunctionDestroy(&dqf_fallback));
@@ -882,7 +887,12 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
     CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec));
   }
   // -- Clone QFunctionAssemblyData
-  CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op_fine->qf_assembled, &(*op_coarse)->qf_assembled));
+  {
+    CeedQFunctionAssemblyData fine_data;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op_fine, &fine_data));
+    CeedCall(CeedQFunctionAssemblyDataReferenceCopy(fine_data, &(*op_coarse)->qf_assembled));
+  }
 
   // Multiplicity vector
   if (op_restrict || op_prolong) {
@@ -1125,6 +1135,27 @@ int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, Ceed
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get `CeedQFunctionAssemblyData`
+
+  @param[in]  op   `CeedOperator` to assemble
+  @param[out] data `CeedQFunctionAssemblyData`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyData *data) {
+  if (!op->qf_assembled) {
+    CeedQFunctionAssemblyData data;
+
+    CeedCall(CeedQFunctionAssemblyDataCreate(op->ceed, &data));
+    op->qf_assembled = data;
+  }
+  *data = op->qf_assembled;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Create object holding `CeedQFunction` assembly data for `CeedOperator`
 
@@ -1304,7 +1335,7 @@ int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data) {
   @brief Get `CeedOperatorAssemblyData`
 
   @param[in]  op   `CeedOperator` to assemble
-  @param[out] data `CeedQFunctionAssemblyData`
+  @param[out] data `CeedOperatorAssemblyData`
 
   @return An error code: 0 - success, otherwise - failure
 
@@ -1868,22 +1899,24 @@ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector
   // Assemble QFunction
   if (LinearAssembleQFunctionUpdate) {
     // Backend or fallback parent version
-    bool                qf_assembled_is_setup;
-    CeedVector          assembled_vec  = NULL;
-    CeedElemRestriction assembled_rstr = NULL;
-
-    CeedCall(CeedQFunctionAssemblyDataIsSetup(op->qf_assembled, &qf_assembled_is_setup));
-    if (qf_assembled_is_setup) {
+    CeedQFunctionAssemblyData data;
+    bool                      data_is_setup;
+    CeedVector                assembled_vec  = NULL;
+    CeedElemRestriction       assembled_rstr = NULL;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+    CeedCall(CeedQFunctionAssemblyDataIsSetup(data, &data_is_setup));
+    if (data_is_setup) {
       bool update_needed;
 
-      CeedCall(CeedQFunctionAssemblyDataGetObjects(op->qf_assembled, &assembled_vec, &assembled_rstr));
-      CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(op->qf_assembled, &update_needed));
+      CeedCall(CeedQFunctionAssemblyDataGetObjects(data, &assembled_vec, &assembled_rstr));
+      CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(data, &update_needed));
       if (update_needed) CeedCall(LinearAssembleQFunctionUpdate(op_assemble, assembled_vec, assembled_rstr, request));
     } else {
       CeedCall(CeedOperatorLinearAssembleQFunction(op_assemble, &assembled_vec, &assembled_rstr, request));
-      CeedCall(CeedQFunctionAssemblyDataSetObjects(op->qf_assembled, assembled_vec, assembled_rstr));
+      CeedCall(CeedQFunctionAssemblyDataSetObjects(data, assembled_vec, assembled_rstr));
     }
-    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, false));
+    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, false));
 
     // Copy reference from internally held copy
     CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled));

From 536b928cfd67e9b0db70f332afdf4cd5725665c0 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 25 Jun 2024 15:08:34 -0700
Subject: [PATCH 097/571] Add CeedOperatorAssemblyDataStrip to public interface

---
 include/ceed/ceed.h       |  1 +
 interface/ceed-operator.c | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 4fff8784bc..f55aa9c6f3 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -459,6 +459,7 @@ CEED_EXTERN int  CeedOperatorGetContextBooleanRead(CeedOperator op, CeedContextF
 CEED_EXTERN int  CeedOperatorRestoreContextBooleanRead(CeedOperator op, CeedContextFieldLabel field_label, const bool **values);
 CEED_EXTERN int  CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
+CEED_EXTERN int  CeedOperatorAssemblyDataStrip(CeedOperator op);
 CEED_EXTERN int  CeedOperatorDestroy(CeedOperator *op);
 
 CEED_EXTERN int CeedOperatorGetFieldByName(CeedOperator op, const char *field_name, CeedOperatorField *op_field);
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index dca66ad78f..495c41fc9c 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -2110,6 +2110,35 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy temporary assembly data associated with a `CeedOperator`
+
+  @param[in,out] op `CeedOperator` whose assembly data to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorAssemblyDataStrip(CeedOperator op) {
+  bool is_composite;
+
+  CeedCall(CeedQFunctionAssemblyDataDestroy(&op->qf_assembled));
+  CeedCall(CeedOperatorAssemblyDataDestroy(&op->op_assembled));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedInt       num_suboperators;
+    CeedOperator *sub_operators;
+
+    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      CeedCall(CeedQFunctionAssemblyDataDestroy(&sub_operators[i]->qf_assembled));
+      CeedCall(CeedOperatorAssemblyDataDestroy(&sub_operators[i]->op_assembled));
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Destroy a `CeedOperator`
 

From f883f0a5423dccfdefcc45bb5c497717e44013fc Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Wed, 26 Jun 2024 10:40:01 -0700
Subject: [PATCH 098/571] Call CeedOperatorAssemblyDataStrip in
 CeedOperatorDestroy

---
 interface/ceed-operator.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 495c41fc9c..16379098c0 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -2153,8 +2153,10 @@ int CeedOperatorDestroy(CeedOperator *op) {
     *op = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  if ((*op)->Destroy) CeedCall((*op)->Destroy(*op));
-  CeedCall(CeedDestroy(&(*op)->ceed));
+  // Backend destroy
+  if ((*op)->Destroy) {
+    CeedCall((*op)->Destroy(*op));
+  }
   // Free fields
   for (CeedInt i = 0; i < (*op)->num_fields; i++) {
     if ((*op)->input_fields[i]) {
@@ -2184,16 +2186,21 @@ int CeedOperatorDestroy(CeedOperator *op) {
       CeedCall(CeedFree(&(*op)->output_fields[i]));
     }
   }
-  // AtPoints data
+  CeedCall(CeedFree(&(*op)->input_fields));
+  CeedCall(CeedFree(&(*op)->output_fields));
+  // Destroy AtPoints data
   CeedCall(CeedVectorDestroy(&(*op)->point_coords));
   CeedCall(CeedElemRestrictionDestroy(&(*op)->rstr_points));
   CeedCall(CeedElemRestrictionDestroy(&(*op)->first_points_rstr));
+  // Destroy assembly data
+  CeedCall(CeedOperatorAssemblyDataStrip(*op));
   // Destroy sub_operators
   for (CeedInt i = 0; i < (*op)->num_suboperators; i++) {
     if ((*op)->sub_operators[i]) {
       CeedCall(CeedOperatorDestroy(&(*op)->sub_operators[i]));
     }
   }
+  CeedCall(CeedFree(&(*op)->sub_operators));
   CeedCall(CeedQFunctionDestroy(&(*op)->qf));
   CeedCall(CeedQFunctionDestroy(&(*op)->dqf));
   CeedCall(CeedQFunctionDestroy(&(*op)->dqfT));
@@ -2209,14 +2216,8 @@ int CeedOperatorDestroy(CeedOperator *op) {
   // Destroy fallback
   CeedCall(CeedOperatorDestroy(&(*op)->op_fallback));
 
-  // Destroy assembly data
-  CeedCall(CeedQFunctionAssemblyDataDestroy(&(*op)->qf_assembled));
-  CeedCall(CeedOperatorAssemblyDataDestroy(&(*op)->op_assembled));
-
-  CeedCall(CeedFree(&(*op)->input_fields));
-  CeedCall(CeedFree(&(*op)->output_fields));
-  CeedCall(CeedFree(&(*op)->sub_operators));
   CeedCall(CeedFree(&(*op)->name));
+  CeedCall(CeedDestroy(&(*op)->ceed));
   CeedCall(CeedFree(op));
   return CEED_ERROR_SUCCESS;
 }

From c6b536a8adc1c8ea763ec6cf1b342657e84d2699 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Wed, 26 Jun 2024 11:30:23 -0700
Subject: [PATCH 099/571] Clarify operator destroy comment

---
 interface/ceed-operator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 16379098c0..4acb8f1983 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -2192,7 +2192,7 @@ int CeedOperatorDestroy(CeedOperator *op) {
   CeedCall(CeedVectorDestroy(&(*op)->point_coords));
   CeedCall(CeedElemRestrictionDestroy(&(*op)->rstr_points));
   CeedCall(CeedElemRestrictionDestroy(&(*op)->first_points_rstr));
-  // Destroy assembly data
+  // Destroy assembly data (must happen before destroying sub_operators)
   CeedCall(CeedOperatorAssemblyDataStrip(*op));
   // Destroy sub_operators
   for (CeedInt i = 0; i < (*op)->num_suboperators; i++) {

From 6d9fcd4bf2031900f926a4c63de98a31f3f5b77c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 27 Jun 2024 10:37:25 -0600
Subject: [PATCH 100/571] ex - add links to Ratel, HONEE

---
 examples/fluids/README.md | 2 ++
 examples/solids/README.md | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index 40ce78cf87..3b036e2f98 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -3,6 +3,8 @@
 This page provides a description of the Navier-Stokes example for the libCEED library, based on PETSc.
 PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
 
+HONEE, a more fully featured fluid dynamics solver, can be found on [GitLab](https://gitlab.com/phypid/honee).
+
 The Navier-Stokes problem solves the compressible Navier-Stokes equations in three dimensions using an explicit time integration.
 The state variables are mass density, momentum density, and energy density.
 
diff --git a/examples/solids/README.md b/examples/solids/README.md
index d6e70d7be4..80f242b809 100644
--- a/examples/solids/README.md
+++ b/examples/solids/README.md
@@ -3,6 +3,8 @@
 This page provides a description of the solid mechanics example for the libCEED library, based on PETSc.
 PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
 
+Ratel, a more fully featured solid mechanics library, can be found on [GitLab](https://gitlab.com/micromorph/ratel).
+
 This code solves the steady-state static momentum balance equations using unstructured high-order finite/spectral element spatial discretizations.
 In this mini-app, we consider three formulations used in solid mechanics applications: linear elasticity, Neo-Hookean hyperelasticity at small strain, and Neo-Hookean hyperelasticity at finite strain.
 All three of these formulations are for compressible materials.

From 3e0c2f3f098a94aa29497d10cd3cab29dd4ca6e3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 27 Jun 2024 15:03:40 -0600
Subject: [PATCH 101/571] fluids - Remove SGS, SmartSim, Torch (moved to HONEE)

---
 .gitlab-ci.yml                                |  10 -
 Makefile                                      |  21 +-
 examples/fluids/Makefile                      |  45 +-
 examples/fluids/README.md                     |  61 --
 examples/fluids/dd_sgs_data/OutScaling.dat    |  13 -
 examples/fluids/dd_sgs_data/b1.dat            |  21 -
 examples/fluids/dd_sgs_data/b2.dat            |   7 -
 examples/fluids/dd_sgs_data/w1.dat            | 121 ----
 examples/fluids/dd_sgs_data/w2.dat            | 121 ----
 examples/fluids/include/log_events.h          |   3 -
 examples/fluids/include/sgs_model_torch.h     |  27 -
 examples/fluids/include/smartsim.h            |  30 -
 examples/fluids/index.md                      | 104 ---
 examples/fluids/navierstokes.c                |   6 -
 examples/fluids/navierstokes.h                |  65 --
 examples/fluids/problems/sgs_dd_model.c       | 660 ------------------
 .../fluids/problems/sgs_model_torch_weak.c    |  22 -
 .../fluids/problems/torch/sgs_model_torch.cpp | 161 -----
 examples/fluids/qfunctions/sgs_dd_model.h     | 259 -------
 examples/fluids/qfunctions/sgs_dd_training.h  |  68 --
 examples/fluids/qfunctions/sgs_dd_utils.h     | 131 ----
 .../fluids/smartsim_regression_framework.py   | 241 -------
 examples/fluids/src/cloptions.c               |   8 -
 examples/fluids/src/log_events.c              |  16 +-
 examples/fluids/src/setuplibceed.c            |   3 -
 examples/fluids/src/setupts.c                 |   9 -
 .../fluids/src/smartsim/sgs_dd_training.c     | 390 -----------
 examples/fluids/src/smartsim/smartsim.c       |  76 --
 examples/fluids/src/smartsim_weak.c           |  46 --
 .../createPyTorchModel/NNModel_HIT.pt         | Bin 1908 -> 0 bytes
 .../NNModel_HIT_fp64_jit.pt                   | Bin 9977 -> 0 bytes
 .../tests-output/createPyTorchModel/README.md |   1 -
 .../createPyTorchModel/update_weights.py      |  71 --
 tests/junit.py                                |  52 +-
 34 files changed, 17 insertions(+), 2852 deletions(-)
 delete mode 100644 examples/fluids/dd_sgs_data/OutScaling.dat
 delete mode 100644 examples/fluids/dd_sgs_data/b1.dat
 delete mode 100644 examples/fluids/dd_sgs_data/b2.dat
 delete mode 100644 examples/fluids/dd_sgs_data/w1.dat
 delete mode 100644 examples/fluids/dd_sgs_data/w2.dat
 delete mode 100644 examples/fluids/include/sgs_model_torch.h
 delete mode 100644 examples/fluids/include/smartsim.h
 delete mode 100644 examples/fluids/problems/sgs_dd_model.c
 delete mode 100644 examples/fluids/problems/sgs_model_torch_weak.c
 delete mode 100644 examples/fluids/problems/torch/sgs_model_torch.cpp
 delete mode 100644 examples/fluids/qfunctions/sgs_dd_model.h
 delete mode 100644 examples/fluids/qfunctions/sgs_dd_training.h
 delete mode 100644 examples/fluids/qfunctions/sgs_dd_utils.h
 delete mode 100755 examples/fluids/smartsim_regression_framework.py
 delete mode 100644 examples/fluids/src/smartsim/sgs_dd_training.c
 delete mode 100644 examples/fluids/src/smartsim/smartsim.c
 delete mode 100644 examples/fluids/src/smartsim_weak.c
 delete mode 100644 examples/fluids/tests-output/createPyTorchModel/NNModel_HIT.pt
 delete mode 100644 examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
 delete mode 100644 examples/fluids/tests-output/createPyTorchModel/README.md
 delete mode 100755 examples/fluids/tests-output/createPyTorchModel/update_weights.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 78e052b855..6b4bfdc3c9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -96,13 +96,8 @@ noether-cpu:
 # Libraries for examples
 # -- PETSc with HIP (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/phypid/spack/share/spack/setup-env.sh && spack load py-torch@2.3+cuda && export USE_TORCH=1
-    - export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids"
-    - spack unload py-torch@2.3+cuda
-    - source /home/phypid/SmartSimTestingSoftware/bin/activate
-    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="fluids-py-smartsim_regression_framework"
 # -- MFEM v4.6
     - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
     - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
@@ -200,13 +195,8 @@ noether-cuda:
 # Libraries for examples
 # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/phypid/spack/share/spack/setup-env.sh && spack load py-torch@2.3+cuda && export USE_TORCH=1
-    - export SMARTREDIS_DIR=/home/phypid/SmartSimTestingSoftware/smartredis/install
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="cuda" junit BACKENDS="$BACKENDS_GPU" search="petsc fluids-navierstokes solids"
-    - spack unload py-torch@2.3+cuda
-    - source /home/phypid/SmartSimTestingSoftware/bin/activate
-    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="fluids-py-smartsim_regression_framework"
 # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
diff --git a/Makefile b/Makefile
index aa1fead449..4d0f150e78 100644
--- a/Makefile
+++ b/Makefile
@@ -97,12 +97,6 @@ ifneq ($(wildcard ../petsc/lib/libpetsc.*),)
   PETSC_DIR ?= ../petsc
 endif
 
-# SmartSim testing
-SMARTREDIS_DIR ?=
-
-# PyTorch testing
-USE_TORCH ?=
-
 # Warning: SANTIZ options still don't run with /gpu/occa
 AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
 
@@ -263,9 +257,7 @@ petscexamples   := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%)
 dealiiexamples  := $(OBJDIR)/dealii-bps
 # Fluid Dynamics Examples
 fluidsexamples.c  := $(sort $(wildcard examples/fluids/*.c))
-fluidsexamples.py := examples/fluids/smartsim_regression_framework.py
 fluidsexamples    := $(fluidsexamples.c:examples/fluids/%.c=$(OBJDIR)/fluids-%)
-fluidsexamples    += $(fluidsexamples.py:examples/fluids/%.py=$(OBJDIR)/fluids-py-%)
 # Solid Mechanics Examples
 solidsexamples.c  := $(sort $(wildcard examples/solids/*.c))
 solidsexamples    := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%)
@@ -626,14 +618,11 @@ $(OBJDIR)/petsc-% : examples/petsc/%.c examples/petsc/libutils.a.PHONY $(libceed
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/petsc/$* $@
 
-$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/include/*.h examples/fluids/problems/*.c examples/fluids/problems/torch/*.cpp examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) examples/fluids/Makefile | $$(@D)/.DIR
+$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/include/*.h examples/fluids/problems/*.c examples/fluids/qfunctions/*.h $(libceed) $(ceed.pc) examples/fluids/Makefile | $$(@D)/.DIR
 	+$(call quiet,MAKE) -C examples/fluids CEED_DIR=`pwd` \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/fluids/$* $@
 
-$(OBJDIR)/fluids-py-% : examples/fluids/%.py $(OBJDIR)/fluids-navierstokes
-	cp $< $@
-
 $(OBJDIR)/solids-% : examples/solids/%.c examples/solids/%.h \
     examples/solids/problems/*.c examples/solids/src/*.c \
     examples/solids/include/*.h examples/solids/problems/*.h examples/solids/qfunctions/*.h \
@@ -655,7 +644,7 @@ NPROC_POOL ?= 1
 export NPROC_POOL
 
 run-% : $(OBJDIR)/%
-	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) ) $(if $(USE_TORCH),--has_torch $(USE_TORCH) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) $(<:$(OBJDIR)/%=%)
+	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) $(<:$(OBJDIR)/%=%)
 
 external_examples := \
 	$(if $(MFEM_DIR),$(mfemexamples)) \
@@ -689,7 +678,7 @@ ctc-% : $(ctests);@$(foreach tst,$(ctests),$(tst) /cpu/$*;)
 
 prove : $(matched)
 	$(info Testing backends: $(BACKENDS))
-	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) ) $(if $(USE_TORCH),--has_torch $(USE_TORCH) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
+	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
 # Run prove target in parallel
 prv : ;@$(MAKE) $(MFLAGS) V=$(V) prove
 
@@ -697,7 +686,7 @@ prove-all :
 	+$(MAKE) prove realsearch=%
 
 junit-% : $(OBJDIR)/%
-	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) ) $(if $(USE_TORCH),--has_torch $(USE_TORCH) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
+	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
 
 junit : $(matched:$(OBJDIR)/%=junit-%)
 
@@ -855,7 +844,7 @@ print-% :
 CONFIG_VARS = CC CXX FC NVCC NVCC_CXX HIPCC \
   OPT CFLAGS CPPFLAGS CXXFLAGS FFLAGS NVCCFLAGS HIPCCFLAGS SYCLFLAGS \
   AR ARFLAGS LDFLAGS LDLIBS LIBCXX SED \
-  MAGMA_DIR OCCA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR SMARTREDIS_DIR USE_TORCH
+  MAGMA_DIR OCCA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR
 
 # $(call needs_save,CFLAGS) returns true (a nonempty string) if CFLAGS
 # was set on the command line or in config.mk (where it will appear as
diff --git a/examples/fluids/Makefile b/examples/fluids/Makefile
index bcec70e4f9..1162dbc8bb 100644
--- a/examples/fluids/Makefile
+++ b/examples/fluids/Makefile
@@ -30,11 +30,9 @@ CFLAGS = -std=c99 \
   $(OPT) $(OPT_EXAMPLES)
 CPPFLAGS = $(call pkgconf, --cflags-only-I $(PETSc.pc) $(ceed.pc)) \
   $(call pkgconf, --variable=cflags_dep $(PETSc.pc))
-CXX = $(call pkgconf, --variable=cxxcompiler $(PETSc.pc) $(ceed.pc))
-CXXFLAGS = -std=c++17 -Wno-deprecated -Wno-tautological-compare
 LDFLAGS = $(call pkgconf, --libs-only-L --libs-only-other $(PETSc.pc) $(ceed.pc))
 LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(PETSc.pc) $(ceed.pc)))
-LDLIBS = $(call pkgconf, --libs-only-l $(PETSc.pc) $(ceed.pc)) -lm -lstdc++
+LDLIBS = $(call pkgconf, --libs-only-l $(PETSc.pc) $(ceed.pc)) -lm
 
 # Address Sanitizer Setup
 # ASAN must be left empty if you don't want to use it
@@ -46,49 +44,13 @@ FFLAGS += $(if $(ASAN),$(AFLAGS))
 LDFLAGS += $(if $(ASAN),$(AFLAGS))
 CPPFLAGS += -I./include
 
-# LibTorch
-USE_TORCH ?=
-ifeq ($(USE_TORCH),1)
-  libtorch.pc := $(shell python ./pytorch_pkgconfig.py)
-  CPPFLAGS += $(call pkgconf, --cflags-only-I $(libtorch.pc))
-  CXXFLAGS += $(call pkgconf, --cflags-only-other $(libtorch.pc))
-  LDFLAGS += $(call pkgconf, --libs-only-L --libs-only-other $(libtorch.pc))
-  LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(libtorch.pc)))
-  LDLIBS += $(call pkgconf, --libs-only-l $(libtorch.pc))
-
-  src.cpp += $(sort $(wildcard $(PROBLEMDIR)/torch/*.cpp))
-  src.c += $(sort $(wildcard $(PROBLEMDIR)/torch/*.c))
-
-  # Intel Pytorch EXtension (IPEX)
-  IPEX_DIR ?=
-  ifdef IPEX_DIR
-      LDFLAGS += -L$(IPEX_DIR)/lib/
-      LDFLAGS += -Wl,-rpath,$(IPEX_DIR)/lib/
-      LDLIBS += -lintel-ext-pt-gpu
-  endif
-endif
-
 # Source Files
 OBJDIR := build
 SRCDIR := src
 PROBLEMDIR := problems
 
 src.c := navierstokes.c $(sort $(wildcard $(PROBLEMDIR)/*.c)) $(sort $(wildcard $(SRCDIR)/*.c))
-src.o = $(src.c:%.c=$(OBJDIR)/%.o) $(src.cpp:%.cpp=$(OBJDIR)/%.o)
-
-# Path to install directory for SmartRedis. Example: /software/smartredis/install
-SMARTREDIS_DIR ?=
-ifdef SMARTREDIS_DIR
-	hiredis.pc := $(SMARTREDIS_DIR)/lib/pkgconfig/hiredis.pc
-	lsmartredis:= -lsmartredis
-	redis++.pc = $(wildcard $(SMARTREDIS_DIR)/lib/pkgconfig/redis++.pc $(SMARTREDIS_DIR)/lib64/pkgconfig/redis++.pc)
-
-	CPPFLAGS += $(call pkgconf, --cflags-only-I $(hiredis.pc) $(redis++.pc))
-	LDFLAGS += $(call pkgconf, --libs-only-L --libs-only-other $(hiredis.pc) $(redis++.pc))
-	LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(hiredis.pc) $(redis++.pc)))
-	LDLIBS += $(call pkgconf, --libs-only-l $(hiredis.pc) $(redis++.pc)) $(lsmartredis)
-	src.c += $(sort $(wildcard $(SRCDIR)/smartsim/*.c))
-endif
+src.o = $(src.c:%.c=$(OBJDIR)/%.o)
 
 all: navierstokes
 
@@ -106,9 +68,6 @@ quiet ?= $($(1))
 $(OBJDIR)/%.o : %.c  Makefile | $$(@D)/.DIR
 	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
 
-$(OBJDIR)/%.o : %.cpp Makefile | $$(@D)/.DIR
-	$(call quiet,CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $(abspath $<)
-
 print: $(PETSc.pc) $(ceed.pc)
 	$(info CC      : $(CC))
 	$(info CFLAGS  : $(CFLAGS))
diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index 3b036e2f98..be0bd82597 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -22,12 +22,6 @@ and run with:
 ./navierstokes -ceed [ceed] -problem [problem type] -degree [degree]
 ```
 
-If you want to do *in situ* machine-learning training, specify `SMARTREDIS_DIR` in the make command like:
-
-```
-make SMARTREDIS_DIR=~/software/smartredis/install
-```
-
 ## Runtime options
 
 % inclusion-fluids-marker
@@ -641,36 +635,6 @@ For the Density Current, Channel, and Blasius problems, the following common com
   -  `-reference_pressure`
   - `Pa`
 
-* - `-sgs_model_type`
-  - Type of subgrid stress model to use. Currently only `data_driven` is available
-  - `none`
-  - string
-
-* - `-sgs_model_dd_leakyrelu_alpha`
-  - Slope parameter for Leaky ReLU activation function. `0` corresponds to normal ReLU
-  - 0
-  -
-
-* - `-sgs_model_dd_parameter_dir`
-  - Path to directory with data-driven model parameters (weights, biases, etc.)
-  - `./dd_sgs_parameters`
-  - string
-
-* - `-sgs_model_dd_model_implementation`
-  - Which computational implementation to use for SGS DD model (`fused`, `sequential_ceed`, `sequential_torch`)
-  - `fused`
-  - string
-
-* - `-sgs_model_dd_torch_model_path`
-  - Path to the PyTorch `*.pt` file containing the DD inference model
-  -
-  - string
-
-* - `-sgs_model_dd_torch_model_device`
-  - What hardware to perform the model inference on (`cpu`, `cuda`, `hip`, `xpu`)
-  - Default matches the libCEED backend
-  - string
-
 * - `-diff_filter_monitor`
   - Enable differential filter TSMonitor
   - `false`
@@ -705,31 +669,6 @@ For the Density Current, Channel, and Blasius problems, the following common com
   - Friction length associated with the flow, $\delta_\nu$. Used in wall-damping functions
   - 0
   - `m`
-
-* - `-sgs_train_enable`
-  - Whether to enable *in situ* training of data-driven SGS model. Require building with SmartRedis.
-  - `false`
-  - boolean
-
-* - `-sgs_train_write_data_interval`
-  - Number of timesteps between writing training data into SmartRedis database
-  - `1`
-  -
-
-* - `-sgs_train_overwrite_data`
-  - Whether new training data should overwrite old data on database
-  - `true`
-  - boolean
-
-* - `-sgs_train_filter_widths`
-  - List of scalar values for different filter widths to calculate for training data
-  -
-  - `m`
-
-* - `-smartsim_collocated_num_ranks`
-  - Number of MPI ranks associated with each collocated database (i.e. ranks per node)
-  - `1`
-  -
 :::
 
 #### Gaussian Wave
diff --git a/examples/fluids/dd_sgs_data/OutScaling.dat b/examples/fluids/dd_sgs_data/OutScaling.dat
deleted file mode 100644
index 94dab73636..0000000000
--- a/examples/fluids/dd_sgs_data/OutScaling.dat
+++ /dev/null
@@ -1,13 +0,0 @@
-12 1
-0e+00
-2e+00
-0e+00
-2e+00
-0e+00
-2e+00
--1e+00
-1e+00
--1e+00
-1e+00
--1e+00
-1e+00
diff --git a/examples/fluids/dd_sgs_data/b1.dat b/examples/fluids/dd_sgs_data/b1.dat
deleted file mode 100644
index 873f658ea9..0000000000
--- a/examples/fluids/dd_sgs_data/b1.dat
+++ /dev/null
@@ -1,21 +0,0 @@
-20 1
-4.899884770038e-01
-3.563204159517e-01
-2.627287776915e-01
-2.951473061921e-02
--4.622340771977e-01
--1.209842939357e-02
--4.663763370896e-01
-8.796932075820e-02
-4.501638907868e-01
-2.077678516370e-01
--1.139284062351e-01
--3.303352644675e-01
--4.148295154500e-01
--4.833042778786e-02
-2.972372410179e-02
--2.464389991227e-01
--2.877421872362e-01
--4.567405721457e-01
-4.734193646824e-01
--4.818997410080e-01
diff --git a/examples/fluids/dd_sgs_data/b2.dat b/examples/fluids/dd_sgs_data/b2.dat
deleted file mode 100644
index 4ff5bd0b30..0000000000
--- a/examples/fluids/dd_sgs_data/b2.dat
+++ /dev/null
@@ -1,7 +0,0 @@
-6 1
-1.176169920799e-01
--2.134958413350e-01
-1.512851885922e-01
-1.612014419874e-01
--1.437293376985e-02
-2.899547585024e-01
diff --git a/examples/fluids/dd_sgs_data/w1.dat b/examples/fluids/dd_sgs_data/w1.dat
deleted file mode 100644
index f27466d680..0000000000
--- a/examples/fluids/dd_sgs_data/w1.dat
+++ /dev/null
@@ -1,121 +0,0 @@
-120 1
--1.573046615553e-01
--8.451867037896e-02
-1.685678425651e-01
--4.017536901661e-01
-4.068168468515e-01
--1.642360540833e-01
-3.676945195442e-01
-3.470122358933e-02
-8.395344749312e-02
-1.230997497940e-01
-1.211759010593e-02
-2.570394361674e-01
-3.340400131793e-01
-2.342342193371e-01
-3.243180076338e-01
-1.235553459881e-01
--7.501312735230e-03
-1.277232278360e-01
-5.125506452634e-02
-1.844073315749e-02
-3.604786379338e-02
-2.063091161008e-01
-2.406054256905e-01
-1.846018306032e-01
-8.525111135827e-02
-3.795039661101e-01
--5.693426468413e-02
--8.111639981827e-02
-5.813760592106e-02
-1.490767475429e-01
--2.410115518494e-01
-2.173034199970e-01
-1.497734413376e-01
--1.296487298257e-01
-2.078686368723e-01
-3.891951801941e-01
-4.243457269355e-01
-1.735201583344e-02
--8.168373382023e-02
--5.933063216886e-02
-1.886585865778e-01
-1.756465348482e-01
-3.295663670792e-01
-1.056135052370e-01
--2.574613681620e-02
-3.683309291418e-01
-3.263624712033e-01
-8.396039179924e-03
--1.916324382654e-01
--2.628404302745e-01
--4.853315252243e-01
-3.133577858731e-01
--2.225070735939e-01
--9.576593410171e-02
-1.447837018193e-01
-2.479471268180e-01
--2.815934342469e-02
-4.508725076092e-02
-1.597744878041e-01
-3.494916947631e-01
--1.426111236028e-02
--1.950362350157e-01
--1.520383426062e-01
--1.344609935156e-01
--2.834500136985e-01
--1.781729998743e-01
--2.521768488857e-03
-6.124647252338e-02
-1.821655951804e-01
-1.293018729851e-01
--9.152586815194e-02
-1.765147511709e-01
-1.875253937772e-02
-2.166082722554e-01
-2.938824219314e-01
-5.320082811374e-02
-2.741659946012e-02
--2.433400466181e-02
--2.085467015769e-01
--1.562518751071e-01
-1.953718281920e-01
--1.221103203238e-02
--6.595354434769e-03
-1.189039582211e-02
-4.107899017131e-02
--6.139734862958e-02
--1.123938999802e-01
-4.565610032251e-04
--1.740175952284e-01
--1.494514855103e-01
--2.351603953684e-01
-3.606743670982e-02
-8.892241319819e-02
--3.823627099458e-02
--4.888216006000e-02
--2.063411767057e-02
-2.653079299534e-02
-2.183949112550e-01
-2.504859939801e-01
-2.814937134408e-01
-7.415384984914e-01
--1.397471716093e+00
--1.489213014481e+00
--5.251418296160e-01
-1.137088253126e-02
--1.895953497433e+00
-7.674570685028e-02
--4.854041451939e+00
-3.391193043882e+00
-2.707932115838e-01
--2.105949983636e-01
-3.070531949510e+00
--4.339743339556e+00
-1.620039300970e+00
--5.362553981240e+00
--3.777406494431e-01
-6.925621482846e-01
-2.343923900615e+00
--3.371354057465e-01
-2.055037536703e-01
diff --git a/examples/fluids/dd_sgs_data/w2.dat b/examples/fluids/dd_sgs_data/w2.dat
deleted file mode 100644
index e1553a6817..0000000000
--- a/examples/fluids/dd_sgs_data/w2.dat
+++ /dev/null
@@ -1,121 +0,0 @@
-120 1
-1.135291623557e-01
-2.505376613198e-01
--9.772966879924e-02
--3.165730972704e-02
-2.807214492556e-01
-1.905260494013e-01
--2.411145792883e-01
-7.384048966390e-02
--1.125988973598e-01
-2.226653706004e-01
--8.789637173632e-02
-2.422358783658e-02
--1.888415645076e-01
--1.810726479901e-01
--1.820814108385e-01
-2.707856893663e-02
-2.395061686285e-01
-3.132696895911e-01
--3.571137262982e-02
--6.703403319249e-02
--2.135582591703e-02
-1.706671398779e-01
--1.422555292276e-01
--1.599414011627e-01
-5.590818266867e-02
-4.760353849516e-02
--8.990354851525e-02
--2.351533551901e-01
--9.919203877195e-02
--8.730502598066e-02
--1.624083994254e-01
--1.756234871059e-01
-2.155448112826e-03
--2.196716615285e-01
-1.230359555198e-01
--2.487008789866e-01
-1.724010972168e-01
-1.200986779247e-01
--1.356200209136e-01
--7.136175504869e-02
--3.284780361916e-02
--2.809583022011e-01
--4.970314689199e-01
--2.877535188767e-02
--3.486136238658e-01
-1.031508309715e-01
--1.166679199470e-01
-1.560071145323e-01
-2.028477831976e-01
-1.679921757572e-01
-1.107170925328e-01
-3.667441712254e-02
-4.279277543497e-02
-1.742941565737e-01
--3.784073837720e-02
-1.170800846414e-01
--8.476677440525e-02
-1.497150762135e-01
-2.095513599240e-01
-1.824870885809e-01
-4.204566627279e-03
--1.556048882917e-02
-1.383926559619e-02
--3.655393508686e-02
-1.111261215177e-01
--3.069205340750e-04
-3.488581056182e-01
--8.042626832384e-02
-1.033683988755e-01
-5.948803437376e-02
--1.994940978541e-01
-7.096924570423e-03
--5.218607313871e-01
--3.428397293084e-01
--2.293382327216e-01
--1.460950001481e-01
--1.581076721431e-01
--2.289507718293e-01
-9.798627298221e-02
-1.437733340246e-01
-1.419228410529e-01
-1.958229699684e-01
-6.931951694653e-03
--7.136749568601e-02
--4.555582403662e-01
--3.070119242611e-01
--2.470410221827e-01
--7.803738726853e-02
-9.142063556119e-02
--1.368559538361e-02
--1.850283326418e-01
-1.152746119954e-02
-1.638429235964e-01
--1.435165512193e-01
--2.534513849487e-01
--2.984090266181e-01
-2.217432932036e-01
--8.358398540164e-02
-7.406614310444e-02
--5.651017266891e-02
--2.270784064420e-01
--2.302290117375e-01
-9.304265393625e-02
-6.798332878752e-02
-4.431976767864e-02
--1.707610729819e-01
--1.410204520039e-01
-1.327823810929e-01
--6.044012224887e-02
--1.376555083883e-02
--3.025252354651e-03
-1.907005235143e-01
-1.291788250753e-01
-5.697185825588e-02
-5.093944063855e-02
--5.412382470510e-02
-2.268724377069e-01
--7.159129384369e-02
--2.554784469980e-01
--1.335334767520e-01
diff --git a/examples/fluids/include/log_events.h b/examples/fluids/include/log_events.h
index 1649da9f5a..418897df94 100644
--- a/examples/fluids/include/log_events.h
+++ b/examples/fluids/include/log_events.h
@@ -18,8 +18,5 @@ extern PetscLogEvent FLUIDS_SmartRedis_Train;
 extern PetscLogEvent FLUIDS_TrainDataCompute;
 extern PetscLogEvent FLUIDS_DifferentialFilter;
 extern PetscLogEvent FLUIDS_VelocityGradientProjection;
-extern PetscLogEvent FLUIDS_SgsModel;
-extern PetscLogEvent FLUIDS_SgsModelDDInference;
-extern PetscLogEvent FLUIDS_SgsModelDDData;
 
 PetscErrorCode RegisterLogEvents();
diff --git a/examples/fluids/include/sgs_model_torch.h b/examples/fluids/include/sgs_model_torch.h
deleted file mode 100644
index a268f1287c..0000000000
--- a/examples/fluids/include/sgs_model_torch.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <petsc.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
-  TORCH_DEVICE_CPU,
-  TORCH_DEVICE_CUDA,
-  TORCH_DEVICE_HIP,
-  TORCH_DEVICE_XPU,
-} TorchDeviceType;
-static const char *const TorchDeviceTypes[] = {"CPU", "CUDA", "HIP", "XPU", "TorchDeviceType", "TORCH_DEVICE_", NULL};
-
-PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum);
-PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/examples/fluids/include/smartsim.h b/examples/fluids/include/smartsim.h
deleted file mode 100644
index f8ba943e5f..0000000000
--- a/examples/fluids/include/smartsim.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-#pragma once
-
-#include <c_client.h>
-#include <petscsys.h>
-#include <sr_enums.h>
-
-#if defined(__clang_analyzer__)
-void PetscCallSmartRedis(SRError);
-#else
-#define PetscCallSmartRedis(...)                                                                                                   \
-  do {                                                                                                                             \
-    SRError   ierr_smartredis_call_q_;                                                                                             \
-    PetscBool disable_calls = PETSC_FALSE;                                                                                         \
-    PetscStackUpdateLine;                                                                                                          \
-    PetscCall(PetscOptionsGetBool(NULL, NULL, "-smartsim_disable_calls", &disable_calls, NULL));                                   \
-    if (disable_calls == PETSC_TRUE) break;                                                                                        \
-    ierr_smartredis_call_q_ = __VA_ARGS__;                                                                                         \
-    if (PetscUnlikely(ierr_smartredis_call_q_ != SRNoError))                                                                       \
-      SETERRQ(PETSC_COMM_SELF, (PetscErrorCode)ierr_smartredis_call_q_, "SmartRedis Error (Code %d): %s", ierr_smartredis_call_q_, \
-              SRGetLastError());                                                                                                   \
-  } while (0)
-#endif
-
-PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length);
diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index ee5ee9282d..31e1a12aa2 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -423,74 +423,6 @@ $$
 \langle \phi' \theta' \rangle = \langle \phi \theta \rangle - \langle \phi \rangle \langle \theta \rangle
 $$
 
-### Subgrid Stress Modeling
-
-When a fluid simulation is under-resolved (the smallest length scale resolved by the grid is much larger than the smallest physical scale, the [Kolmogorov length scale](https://en.wikipedia.org/wiki/Kolmogorov_microscales)), this is mathematically interpreted as filtering the Navier-Stokes equations.
-This is known as large-eddy simulation (LES), as only the "large" scales of turbulence are resolved.
-This filtering operation results in an extra stress-like term, $\bm{\tau}^r$, representing the effect of unresolved (or "subgrid" scale) structures in the flow.
-Denoting the filtering operation by $\overline \cdot$, the LES governing equations are:
-
-$$
-\frac{\partial \bm{\overline q}}{\partial t} + \nabla \cdot \bm{\overline F}(\bm{\overline q}) -S(\bm{\overline q}) = 0 \, ,
-$$ (eq-vector-les)
-
-where
-
-$$
-\bm{\overline F}(\bm{\overline q}) =
-\bm{F} (\bm{\overline q}) +
-\begin{pmatrix}
-    0\\
-     \bm{\tau}^r \\
-     \bm{u}  \cdot \bm{\tau}^r
-\end{pmatrix}
-$$ (eq-les-flux)
-
-More details on deriving the above expression, filtering, and large eddy simulation can be found in {cite}`popeTurbulentFlows2000`.
-To close the problem, the subgrid stress must be defined.
-For implicit LES, the subgrid stress is set to zero and the numerical properties of the discretized system are assumed to account for the effect of subgrid scale structures on the filtered solution field.
-For explicit LES, it is defined by a subgrid stress model.
-
-(sgs-dd-model)=
-#### Data-driven SGS Model
-
-The data-driven SGS model implemented here uses a small neural network to compute the SGS term.
-The SGS tensor is calculated at nodes using an $L^2$ projection of the velocity gradient and grid anisotropy tensor, and then interpolated onto quadrature points.
-More details regarding the theoretical background of the model can be found in {cite}`prakashDDSGS2022` and {cite}`prakashDDSGSAnisotropic2022`.
-
-The neural network itself consists of 1 hidden layer and 20 neurons, using Leaky ReLU as its activation function.
-The slope parameter for the Leaky ReLU function is set via `-sgs_model_dd_leakyrelu_alpha`.
-The outputs of the network are assumed to be normalized on a min-max scale, so they must be rescaled by the original min-max bounds.
-Parameters for the neural network are put into files in a directory found in `-sgs_model_dd_parameter_dir`.
-These files store the network weights (`w1.dat` and `w2.dat`), biases (`b1.dat` and `b2.dat`), and scaling parameters (`OutScaling.dat`).
-The first row of each files stores the number of columns and rows in each file.
-Note that the weight coefficients are assumed to be in column-major order.
-This is done to keep consistent with legacy file compatibility.
-
-:::{note}
-The current data-driven model parameters are not accurate and are for regression testing only.
-:::
-
-##### Data-driven Model Using External Libraries
-
-There are two different modes for using the data-driven model: fused and sequential.
-
-In fused mode, the input processing, model inference, and output handling were all done in a single CeedOperator.
-Fused mode is generally faster than the sequential mode, however fused mode requires that the model architecture be manually implemented into a libCEED QFunction.
-To use the fused mode, set `-sgs_model_dd_implementation fused`.
-
-Sequential mode has separate function calls/CeedOperators for input creation, model inference, and output handling.
-By separating the three steps of the model evaluation, the sequential mode allows for functions calling external libraries to be used for the model inference step.
-The use of these external libraries allows us to leverage the flexibility of those external libraries in their model architectures.
-
-PyTorch is currently the only external library implemented with the sequential mode.
-This is enabled with `USE_TORCH=1` during the build process, which will use the PyTorch accessible from the build environment's Python interpreter.
-To specify the path to the PyTorch model file, use `-sgs_model_dd_torch_model_path`.
-The hardware used to run the model inference is determined automatically from the libCEED backend chosen, but can be overridden with `-sgs_model_dd_torch_model_device`.
-Note that if you chose to run the inference on host while using a GPU libCEED backend (e.g. `/gpu/cuda`), then host-to-device transfers (and vice versa) will be done automatically.
-
-The sequential mode is available using a libCEED based inference evaluation via `-sgs_model_dd_implementation sequential_ceed`, but it is only for verification purposes.
-
 (differential-filtering)=
 ### Differential Filtering
 
@@ -584,42 +516,6 @@ To match the "size" of a normal kernel to our differential kernel, we attempt to
 To match the box and Gaussian filters "sizes", we use $\beta = 1/10$ and $\beta = 1/6$, respectively.
 $\beta$ can be set via `-diff_filter_kernel_scaling`.
 
-### *In Situ* Machine-Learning Model Training
-Training machine-learning models normally uses *a priori* (already gathered) data stored on disk.
-This is computationally inefficient, particularly as the scale of the problem grows and the data that is saved to disk reduces to a small percentage of the total data generated by a simulation.
-One way of working around this to to train a model on data coming from an ongoing simulation, known as *in situ* (in place) learning.
-
-This is implemented in the code using [SmartSim](https://www.craylabs.org/docs/overview.html).
-Briefly, the fluid simulation will periodically place data for training purposes into a database that a separate process uses to train a model.
-The database used by SmartSim is [Redis](https://redis.com/modules/redis-ai/) and the library to connect to the database is called [SmartRedis](https://www.craylabs.org/docs/smartredis.html).
-More information about how to utilize this code in a SmartSim configuration can be found on [SmartSim's website](https://www.craylabs.org/docs/overview.html).
-
-To use this code in a SmartSim *in situ* setup, first the code must be built with SmartRedis enabled.
-This is done by specifying the installation directory of SmartRedis using the `SMARTREDIS_DIR` environment variable when building:
-
-```
-make SMARTREDIS_DIR=~/software/smartredis/install
-```
-
-#### SGS Data-Driven Model *In Situ* Training
-Currently the code is only setup to do *in situ* training for the SGS data-driven model.
-Training data is split into the model inputs and outputs.
-The model inputs are calculated as the same model inputs in the SGS Data-Driven model described {ref}`earlier<sgs-dd-model>`.
-The model outputs (or targets in the case of training) are the subgrid stresses.
-Both the inputs and outputs are computed from a filtered velocity field, which is calculated via {ref}`differential-filtering`.
-The settings for the differential filtering used during training are described in {ref}`differential-filtering`.
-The training will create multiple sets of data per each filter width defined in `-sgs_train_filter_widths`.
-Those scalar filter widths correspond to the scaling correspond to $\bm{D} = c \bm{I}$, where $c$ is the scalar filter width.
-
-The SGS *in situ* training can be enabled using the `-sgs_train_enable` flag.
-Data can be processed and placed into the database periodically.
-The interval between is controlled by `-sgs_train_write_data_interval`.
-There's also the choice of whether to add new training data on each database write or to overwrite the old data with new data.
-This is controlled by `-sgs_train_overwrite_data`.
-
-The database may also be located on the same node as a MPI rank (collocated) or located on a separate node (distributed).
-It's necessary to know how many ranks are associated with each collocated database, which is set by `-smartsim_collocated_database_num_ranks`.
-
 (problem-advection)=
 ## Advection-Diffusion
 
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 713e30df4d..ed0cafb8dd 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -20,8 +20,6 @@
 //
 //TESTARGS(name="Newtonian and Riemann Solver Unit Tests",only="cpu") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e100 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 0 -newtonian_unit_tests -riemann_solver_unit_tests
 //TESTARGS(name="Gaussian Wave, IDL and Entropy variables") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -state_var entropy -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
-//TESTARGS(name="Blasius, SGS DataDriven Sequential Torch",only="torch") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_torch -sgs_model_dd_torch_model_path ./examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
-//TESTARGS(name="Blasius, SGS DataDriven Sequential Ceed") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_implementation sequential_ceed
 //TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
 //TESTARGS(name="Advection 2D, rotation, explicit, supg, consistent mass") -ceed {ceed_resource} -test_type solver -problem advection -degree 3 -dm_plex_box_faces 2,2 -dm_plex_box_lower 0,0 -dm_plex_box_upper 125,125 -bc_wall 1,2,3,4 -wall_comps 4 -units_kilogram 1e-9 -rc 100. -ts_dt 1e-3 -ts_max_steps 10 -stab supg -Ctaus 0.5 -mass_ksp_type gmres -mass_pc_type vpbjacobi -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv2d-rotation-explicit-stab-supg-consistent-mass.bin
 //TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew  -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 7e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin
@@ -29,7 +27,6 @@
 //TESTARGS(name="Advection, rotation, cosine") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 0 -advection_ic_type cosine_hill -dm_plex_box_faces 2,1,1 -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-rotation-cosine.bin
 //TESTARGS(name="Gaussian Wave, using MatShell") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -amat_type shell -pc_type vpbjacobi -ts_alpha_radius 0.5
 //TESTARGS(name="Taylor-Green Vortex IC") -ceed {ceed_resource} -problem taylor_green -test_type solver -dm_plex_dim 3 -dm_plex_box_faces 6,6,6 -ts_max_steps 0 -compare_final_state_atol 1e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-taylor-green-IC.bin
-//TESTARGS(name="Blasius, SGS DataDriven Fused") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin
 //TESTARGS(name="Blasius, Anisotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 5e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_aniso_vandriest.bin -diff_filter_monitor -ts_max_steps 0 -state_var primitive -diff_filter_friction_length 1e-5 -diff_filter_wall_damping_function van_driest -diff_filter_ksp_rtol 1e-8 -diff_filter_grid_based_width -diff_filter_width_scaling 1,0.7,1
 //TESTARGS(name="Blasius, Isotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 2e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_iso.bin -diff_filter_monitor -ts_max_steps 0 -diff_filter_width_scaling 4.2e-5,4.2e-5,4.2e-5 -diff_filter_ksp_atol 1e-14 -diff_filter_ksp_rtol 1e-16
 //TESTARGS(name="Gaussian Wave, with IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -ts_alpha_radius 0.5 -idl_pressure 70
@@ -254,10 +251,7 @@ int main(int argc, char **argv) {
 
   PetscCall(TurbulenceStatisticsDestroy(user, ceed_data));
   PetscCall(NodalProjectionDataDestroy(user->grad_velo_proj));
-  PetscCall(SgsDDDataDestroy(user->sgs_dd_data));
   PetscCall(DifferentialFilterDataDestroy(user->diff_filter));
-  PetscCall(SGS_DD_TrainingDataDestroy(user->sgs_dd_train));
-  PetscCall(SmartSimDataDestroy(user->smartsim));
 
   // -- Vectors
   PetscCallCeed(ceed, CeedVectorDestroy(&ceed_data->x_coord));
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 11f1f19fa8..c2d57c7f32 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -60,22 +60,6 @@ typedef enum {
 } TestType;
 static const char *const TestTypes[] = {"NONE", "SOLVER", "TURB_SPANSTATS", "DIFF_FILTER", "TestType", "TESTTYPE_", NULL};
 
-// Subgrid-Stress mode type
-typedef enum {
-  SGS_MODEL_NONE        = 0,
-  SGS_MODEL_DATA_DRIVEN = 1,
-} SGSModelType;
-static const char *const SGSModelTypes[] = {"NONE", "DATA_DRIVEN", "SGSModelType", "SGS_MODEL_", NULL};
-
-// Subgrid-Stress mode type
-typedef enum {
-  SGS_MODEL_DD_FUSED           = 0,
-  SGS_MODEL_DD_SEQENTIAL_CEED  = 1,
-  SGS_MODEL_DD_SEQENTIAL_TORCH = 2,
-} SGSModelDDImplementation;
-static const char *const SGSModelDDImplementations[] = {"FUSED", "SEQUENTIAL_CEED", "SEQUENTIAL_TORCH", "SGSModelDDImplementation", "SGS_MODEL_DD_",
-                                                        NULL};
-
 // Mesh transformation type
 typedef enum {
   MESH_TRANSFORM_NONE      = 0,
@@ -137,9 +121,6 @@ struct AppCtx_private {
     PetscViewerFormat viewer_format;
     PetscBool         header_written;
   } wall_forces;
-  // Subgrid Stress Model
-  SGSModelType sgs_model_type;
-  PetscBool    sgs_train_enable;
   // Differential Filtering
   PetscBool         diff_filter_monitor;
   MeshTransformType mesh_transform_type;
@@ -173,29 +154,6 @@ typedef struct {
   KSP                  ksp;
 } *NodalProjectionData;
 
-typedef PetscErrorCode (*SgsDDNodalStressEval)(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc);
-typedef PetscErrorCode (*SgsDDNodalStressInference)(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx);
-typedef struct {
-  DM                        dm_sgs, dm_dd_inputs, dm_dd_outputs;
-  PetscInt                  num_comp_sgs, num_comp_inputs, num_comp_outputs;
-  OperatorApplyContext      op_nodal_evaluation_ctx, op_nodal_dd_inputs_ctx, op_nodal_dd_outputs_ctx, op_sgs_apply_ctx;
-  CeedVector                sgs_nodal_ceed, grad_velo_ceed;
-  SgsDDNodalStressEval      sgs_nodal_eval;
-  SgsDDNodalStressInference sgs_nodal_inference;
-  void                     *sgs_nodal_inference_ctx;
-  PetscErrorCode (*sgs_nodal_inference_ctx_destroy)(void *ctx);
-} *SgsDDData;
-
-typedef struct {
-  DM                   dm_dd_training;
-  PetscInt             num_comp_dd_inputs, write_data_interval, num_filter_widths;
-  PetscScalar          filter_widths[16];
-  OperatorApplyContext op_training_data_calc_ctx;
-  NodalProjectionData  filtered_grad_velo_proj;
-  size_t               training_data_array_dims[2];
-  PetscBool            overwrite_training_data;
-} *SGS_DD_TrainingData;
-
 typedef struct {
   DM                    dm_filter;
   PetscInt              num_filtered_fields;
@@ -208,12 +166,6 @@ typedef struct {
   CeedContextFieldLabel filter_width_scaling_label;
 } *DiffFilterData;
 
-typedef struct {
-  void    *client;
-  char     rank_id_name[16];
-  PetscInt collocated_database_num_ranks;
-} *SmartSimData;
-
 // PETSc user data
 struct User_private {
   MPI_Comm             comm;
@@ -233,10 +185,7 @@ struct User_private {
   CeedScalar           time_bc_set;
   SpanStatsData        spanstats;
   NodalProjectionData  grad_velo_proj;
-  SgsDDData            sgs_dd_data;
   DiffFilterData       diff_filter;
-  SmartSimData         smartsim;
-  SGS_DD_TrainingData  sgs_dd_train;
 };
 
 // Units
@@ -446,10 +395,6 @@ PetscErrorCode TurbulenceStatisticsDestroy(User user, CeedData ceed_data);
 // -----------------------------------------------------------------------------
 // Data-Driven Subgrid Stress (DD-SGS) Modeling Functions
 // -----------------------------------------------------------------------------
-
-PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem);
-PetscErrorCode SgsDDDataDestroy(SgsDDData sgs_dd_data);
-PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc);
 PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem, StateVariable state_var_input,
                                                CeedElemRestriction elem_restr_input, CeedBasis basis_input, NodalProjectionData *pgrad_velo_proj);
 PetscErrorCode VelocityGradientProjectionApply(NodalProjectionData grad_velo_proj, Vec Q_loc, Vec VelocityGradient);
@@ -478,13 +423,3 @@ PetscErrorCode DifferentialFilterDataDestroy(DiffFilterData diff_filter);
 PetscErrorCode TSMonitor_DifferentialFilter(TS ts, PetscInt steps, PetscReal solution_time, Vec Q, void *ctx);
 PetscErrorCode DifferentialFilterApply(User user, const PetscReal solution_time, const Vec Q, Vec Filtered_Solution);
 PetscErrorCode DifferentialFilterMmsICSetup(ProblemData problem);
-
-// -----------------------------------------------------------------------------
-// SGS Data-Driven Training via SmartSim
-// -----------------------------------------------------------------------------
-PetscErrorCode SmartSimSetup(User user);
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim);
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem);
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx);
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts);
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train);
diff --git a/examples/fluids/problems/sgs_dd_model.c b/examples/fluids/problems/sgs_dd_model.c
deleted file mode 100644
index 49a7efb9f2..0000000000
--- a/examples/fluids/problems/sgs_dd_model.c
+++ /dev/null
@@ -1,660 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/sgs_dd_model.h"
-
-#include <petscdmplex.h>
-
-#include <sgs_model_torch.h>
-#include "../navierstokes.h"
-
-typedef struct {
-  CeedElemRestriction      elem_restr_grid_aniso, elem_restr_sgs;
-  CeedVector               grid_aniso_ceed;
-  CeedQFunctionContext     sgsdd_qfctx, ifunction_qfctx;
-  SGSModelDDImplementation sgs_dd_model_implementation;
-} *SgsDDSetupData;
-
-PetscErrorCode SgsDDSetupDataDestroy(SgsDDSetupData sgs_dd_setup_data) {
-  Ceed ceed;
-
-  PetscFunctionBeginUser;
-  PetscCall(CeedElemRestrictionGetCeed(sgs_dd_setup_data->elem_restr_sgs, &ceed));
-
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_setup_data->elem_restr_grid_aniso));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_setup_data->elem_restr_sgs));
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_setup_data->ifunction_qfctx));
-  PetscCall(PetscFree(sgs_dd_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create DM for storing subgrid stress at nodes
-static PetscErrorCode SgsDDCreateDM(DM dm_source, DM *dm_sgs, PetscInt degree, PetscInt q_extra, PetscInt *num_components) {
-  PetscSection section;
-
-  PetscFunctionBeginUser;
-  *num_components = 6;
-
-  PetscCall(DMClone(dm_source, dm_sgs));
-  PetscCall(PetscObjectSetName((PetscObject)*dm_sgs, "Subgrid Stress Projection"));
-
-  PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, num_components, *dm_sgs));
-
-  PetscCall(DMGetLocalSection(*dm_sgs, &section));
-  PetscCall(PetscSectionSetFieldName(section, 0, ""));
-  PetscCall(PetscSectionSetComponentName(section, 0, 0, "KMSubgridStressXX"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 1, "KMSubgridStressYY"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 2, "KMSubgridStressZZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 3, "KMSubgridStressYZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 4, "KMSubgridStressXZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 5, "KMSubgridStressXY"));
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-// @brief Evaluate data-driven SGS using fused method
-static PetscErrorCode SgsDDNodalStressEval_Fused(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc) {
-  SgsDDData    sgs_dd_data = user->sgs_dd_data;
-  PetscMemType q_mem_type;
-
-  PetscFunctionBeginUser;
-  PetscCall(VecPetscToCeed(Q_loc, &q_mem_type, user->q_ceed));  // q_ceed is an implicit input
-
-  PetscCall(ApplyCeedOperatorGlobalToLocal(VelocityGradient, SGSNodal_loc, sgs_dd_data->op_nodal_evaluation_ctx));
-
-  PetscCall(VecCeedToPetsc(user->q_ceed, q_mem_type, Q_loc));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create CeedOperator to calculate data-drive SGS at nodes using fused operator
-static PetscErrorCode SgsDDSetupNodalEvaluation_Fused(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) {
-  SgsDDData           sgs_dd_data = user->sgs_dd_data;
-  CeedQFunction       qf_sgs_dd_nodal;
-  CeedOperator        op_sgs_dd_nodal;
-  CeedInt             num_comp_q, num_comp_grad_velo, num_comp_x, num_comp_grid_aniso;
-  PetscInt            dim;
-  CeedVector          inv_multiplicity;
-  CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs;
-  DMLabel             domain_label = NULL;
-  PetscInt            label_value = 0, height = 0, dm_field = 0;
-
-  PetscFunctionBeginUser;
-  PetscCall(DMGetDimension(user->dm, &dim));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso));
-
-  {  // Get velocity gradient information
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo));
-  }
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, &elem_restr_sgs));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_sgs, &sgs_dd_data->sgs_nodal_ceed, NULL));
-
-  PetscCall(GetInverseMultiplicity(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, PETSC_FALSE, &elem_restr_inv_multiplicity,
-                                   &inv_multiplicity));
-
-  // -- Create operator for SGS DD model nodal evaluation
-  switch (user->phys->state_var) {
-    case STATEVAR_PRIMITIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Prim, ComputeSgsDDNodal_Prim_loc, &qf_sgs_dd_nodal));
-      break;
-    case STATEVAR_CONSERVATIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Conserv, ComputeSgsDDNodal_Conserv_loc, &qf_sgs_dd_nodal));
-      break;
-    case STATEVAR_ENTROPY:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Entropy, ComputeSgsDDNodal_Entropy_loc, &qf_sgs_dd_nodal));
-      break;
-  }
-
-  // Mesh/geometry order and solution basis order may differ, therefore must interpolate
-  CeedBasis basis_x_to_q;
-  PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &basis_x_to_q));
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_nodal, sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "q", num_comp_q, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "x", num_comp_x, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "inverse multiplicity", 1, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_nodal, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_NONE));
-
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_nodal, NULL, NULL, &op_sgs_dd_nodal));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "q", ceed_data->elem_restr_q, CEED_BASIS_NONE, user->q_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "x", ceed_data->elem_restr_x, basis_x_to_q, ceed_data->x_coord));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                           sgs_dd_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "km_sgs", elem_restr_sgs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  PetscCall(OperatorApplyContextCreate(user->grad_velo_proj->dm, sgs_dd_data->dm_sgs, ceed, op_sgs_dd_nodal, NULL, sgs_dd_data->sgs_nodal_ceed, NULL,
-                                       NULL, &sgs_dd_data->op_nodal_evaluation_ctx));
-
-  sgs_dd_setup_data->elem_restr_sgs = elem_restr_sgs;
-  sgs_dd_data->sgs_nodal_eval       = SgsDDNodalStressEval_Fused;
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity));
-  PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_to_q));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_nodal));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_nodal));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Setup data-driven model inference using libCEED native implementation
-static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Ceed(Ceed ceed, SgsDDData sgs_dd_data, SgsDDSetupData sgs_dd_setup_data,
-                                                                CeedElemRestriction elem_restr_dd_inputs, CeedElemRestriction elem_restr_dd_outputs,
-                                                                CeedElemRestriction elem_restr_inv_multiplicity, CeedVector inv_multiplicity,
-                                                                void **ctx) {
-  CeedQFunction         qf_sgs_dd_inference;
-  CeedOperator          op_sgs_dd_inference;
-  OperatorApplyContext *op_context = (OperatorApplyContext *)ctx;
-
-  PetscFunctionBeginUser;
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inference, ComputeSgsDDNodal_Sequential_Inference_loc,
-                                                  &qf_sgs_dd_inference));
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_inference, sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inference, "model inputs", sgs_dd_data->num_comp_inputs, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inference, "inverse multiplicity", 1, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inference, "model outputs", sgs_dd_data->num_comp_outputs, CEED_EVAL_NONE));
-
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_inference, NULL, NULL, &op_sgs_dd_inference));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inference, "model inputs", elem_restr_dd_inputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed,
-                CeedOperatorSetField(op_sgs_dd_inference, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inference, "model outputs", elem_restr_dd_outputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  PetscCall(OperatorApplyContextCreate(sgs_dd_data->dm_dd_inputs, sgs_dd_data->dm_dd_outputs, ceed, op_sgs_dd_inference, NULL, NULL, NULL, NULL,
-                                       op_context));
-  sgs_dd_data->sgs_nodal_inference_ctx_destroy = (PetscErrorCode(*)(void *))OperatorApplyContextDestroy;
-
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_inference));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_inference));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Perform data-driven model inference using libCEED native implementation
-PetscErrorCode SgsDDNodalStressEval_Sequential_Ceed(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx) {
-  OperatorApplyContext op_context = *(OperatorApplyContext *)ctx;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  PetscCall(PetscLogGpuTimeBegin());
-  PetscCall(ApplyCeedOperatorLocalToLocal(DD_Inputs_loc, DD_Outputs_loc, op_context));
-  PetscCall(PetscLogGpuTimeEnd());
-  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Setup data-driven model inference using libtorch
-static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Torch(Ceed ceed, SgsDDData sgs_dd_data, SgsDDSetupData sgs_dd_setup_data,
-                                                                 CeedElemRestriction elem_restr_dd_inputs, CeedElemRestriction elem_restr_dd_outputs,
-                                                                 CeedElemRestriction elem_restr_inv_multiplicity, CeedVector inv_multiplicity,
-                                                                 void **ctx) {
-  const char     *ceed_resource;
-  char            model_path[PETSC_MAX_PATH_LEN] = "";
-  TorchDeviceType model_device_type;
-
-  PetscFunctionBeginUser;
-  PetscCallCeed(ceed, CeedGetResource(ceed, &ceed_resource));
-  if (strstr(ceed_resource, "/gpu/cuda")) model_device_type = TORCH_DEVICE_CUDA;
-  else if (strstr(ceed_resource, "/gpu/hip")) model_device_type = TORCH_DEVICE_HIP;
-  // On-device XPU is not working reliably currently, default to CPU inference evaluation
-  // else if (strstr(ceed_resource, "/gpu/sycl")) model_device_type = TORCH_DEVICE_XPU;
-  else model_device_type = TORCH_DEVICE_CPU;
-  PetscCall(PetscOptionsGetEnum(NULL, NULL, "-sgs_model_dd_torch_model_device", TorchDeviceTypes, (PetscEnum *)&model_device_type, NULL));
-  PetscCall(PetscOptionsGetString(NULL, NULL, "-sgs_model_dd_torch_model_path", model_path, sizeof(model_path), NULL));
-
-  PetscCall(LoadModel_Torch(model_path, model_device_type));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Perform data-driven model inference using libtorch
-static PetscErrorCode SgsDDNodalStressEval_Sequential_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx) {
-  static PetscBool run_through = PETSC_FALSE;
-  PetscFunctionBeginUser;
-  if (!run_through) {
-    PetscCall(VecViewFromOptions(DD_Inputs_loc, NULL, "-dd_inputs_loc_view"));
-  }
-  PetscCall(ModelInference_Torch(DD_Inputs_loc, DD_Outputs_loc));
-  if (!run_through) {
-    PetscCall(VecViewFromOptions(DD_Outputs_loc, NULL, "-dd_outputs_loc_view"));
-    run_through = PETSC_TRUE;
-  }
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Evaluate data-driven SGS using sequential method
-PetscErrorCode SgsDDNodalStressEval_Sequential(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc) {
-  SgsDDData    sgs_dd_data = user->sgs_dd_data;
-  PetscMemType q_mem_type;
-  Vec          DD_Inputs_loc, DD_Outputs_loc;
-
-  PetscFunctionBeginUser;
-  PetscCall(DMGetLocalVector(sgs_dd_data->dm_dd_inputs, &DD_Inputs_loc));
-  PetscCall(DMGetLocalVector(sgs_dd_data->dm_dd_outputs, &DD_Outputs_loc));
-  PetscCall(VecPetscToCeed(Q_loc, &q_mem_type, user->q_ceed));  // q_ceed is an implicit input
-
-  PetscCall(ApplyCeedOperatorGlobalToLocal(VelocityGradient, DD_Inputs_loc, sgs_dd_data->op_nodal_dd_inputs_ctx));
-  PetscCall(sgs_dd_data->sgs_nodal_inference(DD_Inputs_loc, DD_Outputs_loc, &sgs_dd_data->sgs_nodal_inference_ctx));
-  PetscCall(ApplyCeedOperatorLocalToLocal(DD_Outputs_loc, SGSNodal_loc, sgs_dd_data->op_nodal_dd_outputs_ctx));
-
-  PetscCall(VecCeedToPetsc(user->q_ceed, q_mem_type, Q_loc));
-  PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_dd_inputs, &DD_Inputs_loc));
-  PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_dd_outputs, &DD_Outputs_loc));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create CeedOperator to calculate data-drive SGS at nodes using sequentially-applied operators
-static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) {
-  SgsDDData           sgs_dd_data = user->sgs_dd_data;
-  CeedInt             num_comp_q, num_comp_grad_velo, num_comp_x, num_comp_grid_aniso, num_comp_eigvec = 9 + 1;
-  PetscInt            dim;
-  CeedVector          inv_multiplicity, eigvec;
-  CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs, elem_restr_eigvec, elem_restr_dd_inputs,
-      elem_restr_dd_outputs;
-  DMLabel  domain_label = NULL;
-  PetscInt label_value = 0, height = 0, dm_field = 0;
-
-  PetscFunctionBeginUser;
-  {  // Create DMs for data-driven input and output values
-    PetscSection section;
-    PetscInt     degree, q_extra;
-    {  // Get degree and number of quadrature points from dm_sgs
-      PetscFE         fe;
-      PetscSpace      basis;
-      PetscQuadrature quadrature;
-      PetscInt        num_qpnts;
-      PetscCall(DMGetField(sgs_dd_data->dm_sgs, 0, NULL, (PetscObject *)&fe));
-      PetscCall(PetscFEGetBasisSpace(fe, &basis));
-      PetscCall(PetscSpaceGetDegree(basis, &degree, NULL));
-      PetscCall(PetscFEGetQuadrature(fe, &quadrature));
-      PetscCall(PetscQuadratureGetOrder(quadrature, &num_qpnts));
-      q_extra = degree - num_qpnts;
-    }
-
-    PetscCall(DMClone(sgs_dd_data->dm_sgs, &sgs_dd_data->dm_dd_inputs));
-    PetscCall(PetscObjectSetName((PetscObject)sgs_dd_data->dm_dd_inputs, "Data-Driven Model Inputs"));
-    PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, &sgs_dd_data->num_comp_inputs, sgs_dd_data->dm_dd_inputs));
-    PetscCall(DMGetLocalSection(sgs_dd_data->dm_dd_inputs, &section));
-    PetscCall(PetscSectionSetFieldName(section, 0, ""));
-    for (CeedInt i = 0; i < sgs_dd_data->num_comp_inputs; i++) {
-      char component_name[PETSC_MAX_PATH_LEN];
-
-      PetscCall(PetscSNPrintf(component_name, sizeof component_name, "DataDrivenInput%" CeedInt_FMT, i + 1));
-      PetscCall(PetscSectionSetComponentName(section, 0, i, component_name));
-    }
-
-    PetscCall(DMClone(sgs_dd_data->dm_sgs, &sgs_dd_data->dm_dd_outputs));
-    PetscCall(PetscObjectSetName((PetscObject)sgs_dd_data->dm_dd_outputs, "Data-Driven Model Outputs"));
-    PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, &sgs_dd_data->num_comp_outputs, sgs_dd_data->dm_dd_outputs));
-    PetscCall(DMGetLocalSection(sgs_dd_data->dm_dd_outputs, &section));
-    PetscCall(PetscSectionSetFieldName(section, 0, ""));
-    for (CeedInt i = 0; i < sgs_dd_data->num_comp_outputs; i++) {
-      char component_name[PETSC_MAX_PATH_LEN];
-
-      PetscCall(PetscSNPrintf(component_name, sizeof component_name, "DataDrivenOutput%" CeedInt_FMT, i + 1));
-      PetscCall(PetscSectionSetComponentName(section, 0, i, component_name));
-    }
-  }
-
-  PetscCall(DMGetDimension(user->dm, &dim));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso));
-
-  {  // Get velocity gradient information
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_grad_velo, &sgs_dd_data->grad_velo_ceed, NULL));
-  }
-
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, &elem_restr_sgs));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_sgs, &sgs_dd_data->sgs_nodal_ceed, NULL));
-  PetscCall(
-      DMPlexCeedElemRestrictionCollocatedCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, num_comp_eigvec, &elem_restr_eigvec));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_eigvec, &eigvec, NULL));
-
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_dd_inputs, domain_label, label_value, height, dm_field, &elem_restr_dd_inputs));
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_dd_outputs, domain_label, label_value, height, dm_field, &elem_restr_dd_outputs));
-
-  PetscCall(GetInverseMultiplicity(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, PETSC_FALSE, &elem_restr_inv_multiplicity,
-                                   &inv_multiplicity));
-
-  {  // Create operator for data-driven input evaluation
-    CeedQFunction qf_sgs_dd_inputs;
-    CeedOperator  op_sgs_dd_inputs;
-
-    switch (user->phys->state_var) {
-      case STATEVAR_PRIMITIVE:
-        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Prim,
-                                                        ComputeSgsDDNodal_Sequential_Inputs_Prim_loc, &qf_sgs_dd_inputs));
-        break;
-      case STATEVAR_CONSERVATIVE:
-        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Conserv,
-                                                        ComputeSgsDDNodal_Sequential_Inputs_Conserv_loc, &qf_sgs_dd_inputs));
-        break;
-      case STATEVAR_ENTROPY:
-        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Entropy,
-                                                        ComputeSgsDDNodal_Sequential_Inputs_Entropy_loc, &qf_sgs_dd_inputs));
-        break;
-    }
-
-    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_inputs, sgs_dd_setup_data->sgsdd_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "q", num_comp_q, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "inverse multiplicity", 1, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inputs, "eigenvectors", num_comp_eigvec, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inputs, "model inputs", sgs_dd_data->num_comp_inputs, CEED_EVAL_NONE));
-
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_inputs, NULL, NULL, &op_sgs_dd_inputs));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "q", ceed_data->elem_restr_q, CEED_BASIS_NONE, user->q_ceed));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                             sgs_dd_setup_data->grid_aniso_ceed));
-    PetscCallCeed(ceed,
-                  CeedOperatorSetField(op_sgs_dd_inputs, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "eigenvectors", elem_restr_eigvec, CEED_BASIS_NONE, eigvec));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "model inputs", elem_restr_dd_inputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-    PetscCall(OperatorApplyContextCreate(user->grad_velo_proj->dm, sgs_dd_data->dm_dd_inputs, ceed, op_sgs_dd_inputs, NULL, NULL, NULL, NULL,
-                                         &sgs_dd_data->op_nodal_dd_inputs_ctx));
-    PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_inputs));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_inputs));
-  }
-
-  {  // Create operator for data-driven output handling
-    CeedQFunction qf_sgs_dd_outputs;
-    CeedOperator  op_sgs_dd_outputs;
-
-    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Outputs, ComputeSgsDDNodal_Sequential_Outputs_loc,
-                                                    &qf_sgs_dd_outputs));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_outputs, sgs_dd_setup_data->sgsdd_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "model outputs", sgs_dd_data->num_comp_outputs, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "inverse multiplicity", 1, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "eigenvectors", num_comp_eigvec, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_outputs, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_NONE));
-
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_outputs, NULL, NULL, &op_sgs_dd_outputs));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "model outputs", elem_restr_dd_outputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                             sgs_dd_setup_data->grid_aniso_ceed));
-    PetscCallCeed(ceed,
-                  CeedOperatorSetField(op_sgs_dd_outputs, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "eigenvectors", elem_restr_eigvec, CEED_BASIS_NONE, eigvec));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "km_sgs", elem_restr_sgs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-    PetscCall(OperatorApplyContextCreate(sgs_dd_data->dm_dd_outputs, sgs_dd_data->dm_sgs, ceed, op_sgs_dd_outputs, NULL, sgs_dd_data->sgs_nodal_ceed,
-                                         NULL, NULL, &sgs_dd_data->op_nodal_dd_outputs_ctx));
-    PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_outputs));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_outputs));
-  }
-
-  sgs_dd_data->sgs_nodal_eval = SgsDDNodalStressEval_Sequential;
-
-  if (sgs_dd_setup_data->sgs_dd_model_implementation == SGS_MODEL_DD_SEQENTIAL_CEED) {
-    sgs_dd_data->sgs_nodal_inference = SgsDDNodalStressEval_Sequential_Ceed;
-    PetscCall(SgsDDSetupNodalEvaluation_Sequential_Ceed(ceed, sgs_dd_data, sgs_dd_setup_data, elem_restr_dd_inputs, elem_restr_dd_outputs,
-                                                        elem_restr_inv_multiplicity, inv_multiplicity, &sgs_dd_data->sgs_nodal_inference_ctx));
-  } else if (sgs_dd_setup_data->sgs_dd_model_implementation == SGS_MODEL_DD_SEQENTIAL_TORCH) {
-    sgs_dd_data->sgs_nodal_inference = SgsDDNodalStressEval_Sequential_Torch;
-    PetscCall(SgsDDSetupNodalEvaluation_Sequential_Torch(ceed, sgs_dd_data, sgs_dd_setup_data, elem_restr_dd_inputs, elem_restr_dd_outputs,
-                                                         elem_restr_inv_multiplicity, inv_multiplicity, &sgs_dd_data->sgs_nodal_inference_ctx));
-  }
-
-  sgs_dd_setup_data->elem_restr_sgs = elem_restr_sgs;
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity));
-  PetscCallCeed(ceed, CeedVectorDestroy(&eigvec));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_eigvec));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dd_inputs));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dd_outputs));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create CeedOperator to compute SGS contribution to the residual
-static PetscErrorCode SgsSetupNodalIFunction(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) {
-  SgsDDData     sgs_dd_data = user->sgs_dd_data;
-  CeedInt       num_comp_q, num_comp_qd, num_comp_x;
-  PetscInt      dim;
-  CeedQFunction qf_sgs_apply;
-  CeedOperator  op_sgs_apply;
-  CeedBasis     basis_sgs;
-
-  PetscFunctionBeginUser;
-  PetscCall(DMGetDimension(user->dm, &dim));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_qd_i, &num_comp_qd));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x));
-
-  PetscCall(CreateBasisFromPlex(ceed, sgs_dd_data->dm_sgs, 0, 0, 0, 0, &basis_sgs));
-
-  switch (user->phys->state_var) {
-    case STATEVAR_PRIMITIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Prim, IFunction_NodalSgs_Prim_loc, &qf_sgs_apply));
-      break;
-    case STATEVAR_CONSERVATIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Conserv, IFunction_NodalSgs_Conserv_loc, &qf_sgs_apply));
-      break;
-    case STATEVAR_ENTROPY:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Entropy, IFunction_NodalSgs_Entropy_loc, &qf_sgs_apply));
-      break;
-  }
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_apply, sgs_dd_setup_data->ifunction_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "q", num_comp_q, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "qdata", num_comp_qd, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_apply, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
-
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_apply, NULL, NULL, &op_sgs_apply));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "km_sgs", sgs_dd_setup_data->elem_restr_sgs, basis_sgs, sgs_dd_data->sgs_nodal_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-
-  PetscCall(
-      OperatorApplyContextCreate(user->dm, user->dm, ceed, op_sgs_apply, user->q_ceed, user->g_ceed, NULL, NULL, &sgs_dd_data->op_sgs_apply_ctx));
-
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_apply));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_apply));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Calculate and add data-driven SGS residual to the global residual
-PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc) {
-  SgsDDData    sgs_dd_data = user->sgs_dd_data;
-  Vec          VelocityGradient, SGSNodal_loc;
-  PetscMemType sgs_nodal_mem_type;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscLogEventBegin(FLUIDS_SgsModel, Q_loc, G_loc, NULL, NULL));
-  PetscCall(DMGetGlobalVector(user->grad_velo_proj->dm, &VelocityGradient));
-  PetscCall(VelocityGradientProjectionApply(user->grad_velo_proj, Q_loc, VelocityGradient));
-
-  // -- Compute Nodal SGS tensor
-  PetscCall(DMGetLocalVector(sgs_dd_data->dm_sgs, &SGSNodal_loc));
-  PetscCall(sgs_dd_data->sgs_nodal_eval(user, Q_loc, VelocityGradient, SGSNodal_loc));
-
-  // -- Compute contribution of the SGS stress
-  PetscCall(VecPetscToCeed(SGSNodal_loc, &sgs_nodal_mem_type, sgs_dd_data->sgs_nodal_ceed));  // sgs_nodal_ceed is an implicit input
-  PetscCall(ApplyAddCeedOperatorLocalToLocal(Q_loc, G_loc, sgs_dd_data->op_sgs_apply_ctx));
-
-  // -- Return local SGS vector
-  PetscCall(VecCeedToPetsc(sgs_dd_data->sgs_nodal_ceed, sgs_nodal_mem_type, SGSNodal_loc));
-  PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_sgs, &SGSNodal_loc));
-  PetscCall(DMRestoreGlobalVector(user->grad_velo_proj->dm, &VelocityGradient));
-  PetscCall(PetscLogEventEnd(FLUIDS_SgsModel, Q_loc, G_loc, NULL, NULL));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief B = A^T, A is NxM, B is MxN
-static PetscErrorCode TransposeMatrix(const PetscScalar *A, PetscScalar *B, const PetscInt N, const PetscInt M) {
-  PetscFunctionBeginUser;
-  for (PetscInt i = 0; i < N; i++) {
-    for (PetscInt j = 0; j < M; j++) {
-      B[j * N + i] = A[i * M + j];
-    }
-  }
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Read neural network coefficients from file and put into context struct
-static PetscErrorCode SgsDDContextFill(MPI_Comm comm, char data_dir[PETSC_MAX_PATH_LEN], SgsDDContext *psgsdd_ctx) {
-  SgsDDContext sgsdd_ctx;
-  PetscInt     num_inputs = (*psgsdd_ctx)->num_inputs, num_outputs = (*psgsdd_ctx)->num_outputs, num_neurons = (*psgsdd_ctx)->num_neurons;
-  char         file_path[PETSC_MAX_PATH_LEN];
-  PetscScalar *temp;
-
-  PetscFunctionBeginUser;
-  {
-    SgsDDContext sgsdd_temp;
-    PetscCall(PetscNew(&sgsdd_temp));
-    *sgsdd_temp                     = **psgsdd_ctx;
-    sgsdd_temp->offsets.bias1       = 0;
-    sgsdd_temp->offsets.bias2       = sgsdd_temp->offsets.bias1 + num_neurons;
-    sgsdd_temp->offsets.weight1     = sgsdd_temp->offsets.bias2 + num_neurons;
-    sgsdd_temp->offsets.weight2     = sgsdd_temp->offsets.weight1 + num_neurons * num_inputs;
-    sgsdd_temp->offsets.out_scaling = sgsdd_temp->offsets.weight2 + num_inputs * num_neurons;
-    PetscInt total_num_scalars      = sgsdd_temp->offsets.out_scaling + 2 * num_outputs;
-    sgsdd_temp->total_bytes         = sizeof(*sgsdd_ctx) + total_num_scalars * sizeof(sgsdd_ctx->data[0]);
-    PetscCall(PetscMalloc(sgsdd_temp->total_bytes, &sgsdd_ctx));
-    *sgsdd_ctx = *sgsdd_temp;
-    PetscCall(PetscFree(sgsdd_temp));
-  }
-
-  PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "b1.dat"));
-  PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.bias1]));
-  PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "b2.dat"));
-  PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.bias2]));
-  PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "OutScaling.dat"));
-  PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling]));
-
-  {
-    PetscCall(PetscMalloc1(num_inputs * num_neurons, &temp));
-    PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "w1.dat"));
-    PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, temp));
-    PetscCall(TransposeMatrix(temp, &sgsdd_ctx->data[sgsdd_ctx->offsets.weight1], num_inputs, num_neurons));
-    PetscCall(PetscFree(temp));
-  }
-  {
-    PetscCall(PetscMalloc1(num_outputs * num_neurons, &temp));
-    PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "w2.dat"));
-    PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, temp));
-    PetscCall(TransposeMatrix(temp, &sgsdd_ctx->data[sgsdd_ctx->offsets.weight2], num_neurons, num_outputs));
-    PetscCall(PetscFree(temp));
-  }
-
-  PetscCall(PetscFree(*psgsdd_ctx));
-  *psgsdd_ctx = sgsdd_ctx;
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) {
-  PetscReal                alpha = 0;
-  SgsDDContext             sgsdd_ctx;
-  MPI_Comm                 comm                           = user->comm;
-  char                     sgs_dd_dir[PETSC_MAX_PATH_LEN] = "./dd_sgs_parameters";
-  SgsDDSetupData           sgs_dd_setup_data;
-  NewtonianIdealGasContext gas;
-
-  PetscFunctionBeginUser;
-  PetscCall(VelocityGradientProjectionSetup(ceed, user, ceed_data, problem, user->phys->state_var, ceed_data->elem_restr_q, ceed_data->basis_q,
-                                            &user->grad_velo_proj));
-
-  PetscCall(PetscNew(&user->sgs_dd_data));
-  user->sgs_dd_data->num_comp_inputs  = 6;
-  user->sgs_dd_data->num_comp_outputs = 6;
-
-  PetscCall(PetscNew(&sgs_dd_setup_data));
-
-  PetscOptionsBegin(comm, NULL, "SGS Data-Driven Model Options", NULL);
-  PetscCall(PetscOptionsReal("-sgs_model_dd_leakyrelu_alpha", "Slope parameter for Leaky ReLU activation function", NULL, alpha, &alpha, NULL));
-  PetscCall(PetscOptionsString("-sgs_model_dd_parameter_dir", "Path to directory with model parameters (weights, biases, etc.)", NULL, sgs_dd_dir,
-                               sgs_dd_dir, sizeof(sgs_dd_dir), NULL));
-  PetscCall(PetscOptionsDeprecated("-sgs_model_dd_use_fused", NULL, "libCEED 0.12.0", "Use -sgs_model_dd_type instead"));
-  sgs_dd_setup_data->sgs_dd_model_implementation = SGS_MODEL_DD_FUSED;
-  PetscCall(PetscOptionsEnum("-sgs_model_dd_implementation", "Data-Driven SGS model implementation", NULL, SGSModelDDImplementations,
-                             (PetscEnum)sgs_dd_setup_data->sgs_dd_model_implementation, (PetscEnum *)&sgs_dd_setup_data->sgs_dd_model_implementation,
-                             NULL));
-  PetscOptionsEnd();
-
-  PetscCall(PetscNew(&sgsdd_ctx));
-  sgsdd_ctx->num_layers  = 1;
-  sgsdd_ctx->num_inputs  = 6;
-  sgsdd_ctx->num_outputs = 6;
-  sgsdd_ctx->num_neurons = 20;
-  sgsdd_ctx->alpha       = alpha;
-
-  PetscCall(SgsDDContextFill(comm, sgs_dd_dir, &sgsdd_ctx));
-
-  // -- Create DM for storing SGS tensor at nodes
-  PetscCall(SgsDDCreateDM(user->dm, &user->sgs_dd_data->dm_sgs, user->app_ctx->degree, user->app_ctx->q_extra, &user->sgs_dd_data->num_comp_sgs));
-
-  PetscCallCeed(ceed, CeedQFunctionContextGetDataRead(problem->apply_vol_ifunction.qfunction_context, CEED_MEM_HOST, &gas));
-  sgsdd_ctx->gas = *gas;
-  PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas));
-  PetscCallCeed(ceed, CeedQFunctionContextCreate(user->ceed, &sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed,
-                CeedQFunctionContextSetData(sgs_dd_setup_data->sgsdd_qfctx, CEED_MEM_HOST, CEED_USE_POINTER, sgsdd_ctx->total_bytes, sgsdd_ctx));
-  PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(sgs_dd_setup_data->sgsdd_qfctx, CEED_MEM_HOST, FreeContextPetsc));
-
-  PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(problem->apply_vol_ifunction.qfunction_context, &sgs_dd_setup_data->ifunction_qfctx));
-
-  // -- Compute and store anisotropy tensor
-  PetscCall(GridAnisotropyTensorProjectionSetupApply(ceed, user, ceed_data, &sgs_dd_setup_data->elem_restr_grid_aniso,
-                                                     &sgs_dd_setup_data->grid_aniso_ceed));
-
-  // -- Create Nodal Evaluation Operator
-  switch (sgs_dd_setup_data->sgs_dd_model_implementation) {
-    case SGS_MODEL_DD_FUSED:
-      PetscCall(SgsDDSetupNodalEvaluation_Fused(ceed, user, ceed_data, sgs_dd_setup_data));
-      break;
-    case SGS_MODEL_DD_SEQENTIAL_CEED:
-    case SGS_MODEL_DD_SEQENTIAL_TORCH:
-      PetscCall(SgsDDSetupNodalEvaluation_Sequential(ceed, user, ceed_data, sgs_dd_setup_data));
-      break;
-  }
-
-  // -- Create Operator to evalutate residual of SGS stress
-  PetscCall(SgsSetupNodalIFunction(ceed, user, ceed_data, sgs_dd_setup_data));
-
-  PetscCall(SgsDDSetupDataDestroy(sgs_dd_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SgsDDDataDestroy(SgsDDData sgs_dd_data) {
-  PetscFunctionBeginUser;
-  if (!sgs_dd_data) PetscFunctionReturn(PETSC_SUCCESS);
-  Ceed ceed = sgs_dd_data->op_sgs_apply_ctx->ceed;
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_data->sgs_nodal_ceed));
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_data->grad_velo_ceed));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_evaluation_ctx));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_sgs_apply_ctx));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_dd_inputs_ctx));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_dd_outputs_ctx));
-  PetscCall(DMDestroy(&sgs_dd_data->dm_sgs));
-  PetscCall(DMDestroy(&sgs_dd_data->dm_dd_inputs));
-  PetscCall(DMDestroy(&sgs_dd_data->dm_dd_outputs));
-  if (sgs_dd_data->sgs_nodal_inference_ctx) PetscCall(sgs_dd_data->sgs_nodal_inference_ctx_destroy(sgs_dd_data->sgs_nodal_inference_ctx));
-  PetscCall(PetscFree(sgs_dd_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/problems/sgs_model_torch_weak.c b/examples/fluids/problems/sgs_model_torch_weak.c
deleted file mode 100644
index 36992a7d5c..0000000000
--- a/examples/fluids/problems/sgs_model_torch_weak.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-//
-// @file This creates weak functions for libtorch dependent functions.
-
-#include <sgs_model_torch.h>
-
-PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum) __attribute__((weak));
-PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with USE_TORCH set to run %s", __func__);
-}
-
-PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) __attribute__((weak));
-PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with USE_TORCH set to run %s", __func__);
-}
diff --git a/examples/fluids/problems/torch/sgs_model_torch.cpp b/examples/fluids/problems/torch/sgs_model_torch.cpp
deleted file mode 100644
index 28641b8664..0000000000
--- a/examples/fluids/problems/torch/sgs_model_torch.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <log_events.h>
-#include <petsc.h>
-#include <sgs_model_torch.h>
-#include <torch/script.h>
-#include <torch/torch.h>
-
-torch::jit::script::Module model;
-torch::DeviceType          device_model;
-
-static PetscErrorCode EnumToDeviceType(TorchDeviceType device_enum, torch::DeviceType *device_type) {
-  PetscFunctionBeginUser;
-  switch (device_enum) {
-    case TORCH_DEVICE_CPU:
-      *device_type = torch::kCPU;
-      break;
-    case TORCH_DEVICE_XPU:
-      *device_type = torch::kXPU;
-      break;
-    case TORCH_DEVICE_CUDA:
-      *device_type = torch::kCUDA;
-      break;
-    case TORCH_DEVICE_HIP:
-      *device_type = torch::kHIP;
-      break;
-    default:
-      SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "TorchDeviceType %d not supported by PyTorch inference", device_enum);
-  }
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-static PetscErrorCode PetscMemTypeToDeviceType(PetscMemType mem_type, torch::DeviceType *device_type) {
-  PetscFunctionBeginUser;
-  switch (mem_type) {
-    case PETSC_MEMTYPE_HOST:
-      *device_type = torch::kCPU;
-      break;
-    case PETSC_MEMTYPE_SYCL:
-      *device_type = torch::kXPU;
-      break;
-    case PETSC_MEMTYPE_CUDA:
-      *device_type = torch::kCUDA;
-      break;
-    case PETSC_MEMTYPE_HIP:
-      *device_type = torch::kHIP;
-      break;
-    default:
-      SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "PetscMemType %s not supported by PyTorch inference", PetscMemTypeToString(mem_type));
-  }
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode LoadModel_Torch(const char *model_path, TorchDeviceType device_enum) {
-  PetscFunctionBeginUser;
-  PetscCall(EnumToDeviceType(device_enum, &device_model));
-
-  PetscCallCXX(model = torch::jit::load(model_path));
-  PetscCallCXX(model.to(torch::Device(device_model)));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// Load and run model
-PetscErrorCode ModelInference_Torch(Vec DD_Inputs_loc, Vec DD_Outputs_loc) {
-  torch::Tensor  input_tensor, output_tensor;
-  const PetscInt num_input_comps = 6, num_output_comps = 6;
-  PetscBool      debug_tensor_output = PETSC_FALSE;
-
-  PetscFunctionBeginUser;
-  // torch::NoGradGuard no_grad; // equivalent to "with torch.no_grad():" in PyTorch
-  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  {  // Transfer DD_Inputs_loc into input_tensor
-    PetscMemType         input_mem_type;
-    PetscInt             input_size, num_nodes;
-    const PetscScalar   *dd_inputs_ptr;
-    torch::DeviceType    dd_input_device;
-    torch::TensorOptions options;
-
-    PetscCall(VecGetLocalSize(DD_Inputs_loc, &input_size));
-    num_nodes = input_size / num_input_comps;
-    PetscCall(VecGetArrayReadAndMemType(DD_Inputs_loc, &dd_inputs_ptr, &input_mem_type));
-    PetscCall(PetscMemTypeToDeviceType(input_mem_type, &dd_input_device));
-
-    PetscCallCXX(options = torch::TensorOptions().dtype(torch::kFloat64).device(dd_input_device));
-    if (dd_input_device == torch::kXPU) {  // XPU requires device-to-host-to-device transfer
-      PetscCallCXX(input_tensor =
-                       at::from_blob((void *)dd_inputs_ptr, {num_nodes, num_input_comps}, {num_input_comps, 1}, nullptr, options, dd_input_device)
-                           .to(device_model));
-    } else {
-      PetscCallCXX(input_tensor = torch::from_blob((void *)dd_inputs_ptr, {num_nodes, num_input_comps}, options));
-    }
-    if (debug_tensor_output) {
-      double *input_tensor_ptr;
-
-      PetscCall(VecGetLocalSize(DD_Inputs_loc, &input_size));
-      PetscCallCXX(input_tensor_ptr = (double *)input_tensor.contiguous().to(torch::kCPU).data_ptr());
-      printf("Input_Tensor_Pointer:\n");
-      for (PetscInt i = 0; i < input_size; i++) {
-        printf("%f\n", input_tensor_ptr[i]);
-      }
-    }
-    PetscCall(VecRestoreArrayReadAndMemType(DD_Inputs_loc, &dd_inputs_ptr));
-  }
-  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-
-  // Run model
-  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  PetscCall(PetscLogGpuTimeBegin());
-  PetscCallCXX(output_tensor = model.forward({input_tensor}).toTensor());
-  PetscCall(PetscLogGpuTimeEnd());
-  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDInference, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-
-  PetscCall(PetscLogEventBegin(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  {  // Transfer output_tensor to DD_Outputs_loc
-    torch::DeviceType    dd_output_device;
-    torch::TensorOptions options;
-    PetscInt             output_size;
-    PetscScalar         *dd_outputs_ptr;
-    PetscMemType         output_mem_type;
-
-    {  // Get DeviceType of DD_Outputs_loc
-      PetscCall(VecGetArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr, &output_mem_type));
-      PetscCall(PetscMemTypeToDeviceType(output_mem_type, &dd_output_device));
-      PetscCall(VecRestoreArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr));
-    }
-
-    if (dd_output_device == torch::kXPU) {  // XPU requires device-to-host-to-device transfer
-      double *output_tensor_ptr;
-
-      PetscCall(VecGetLocalSize(DD_Outputs_loc, &output_size));
-      PetscCall(VecGetArray(DD_Outputs_loc, &dd_outputs_ptr));
-      PetscCallCXX(output_tensor_ptr = (double *)output_tensor.contiguous().to(torch::kCPU).data_ptr());
-      if (debug_tensor_output) {
-        printf("Output_Tensor_Pointer:\n");
-        for (PetscInt i = 0; i < output_size; i++) {
-          printf("%f\n", output_tensor_ptr[i]);
-        }
-      }
-      PetscCall(PetscArraycpy(dd_outputs_ptr, output_tensor_ptr, output_size));
-      PetscCall(VecRestoreArray(DD_Outputs_loc, &dd_outputs_ptr));
-    } else {
-      PetscInt      num_nodes;
-      torch::Tensor DD_Outputs_tensor;
-
-      PetscCall(VecGetLocalSize(DD_Outputs_loc, &output_size));
-      num_nodes = output_size / num_output_comps;
-      PetscCall(VecGetArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr, &output_mem_type));
-      PetscCallCXX(options = torch::TensorOptions().dtype(torch::kFloat64).device(dd_output_device));
-      PetscCallCXX(DD_Outputs_tensor = torch::from_blob((void *)dd_outputs_ptr, {num_nodes, num_output_comps}, options));
-      PetscCallCXX(DD_Outputs_tensor.copy_(output_tensor));
-      PetscCall(VecRestoreArrayAndMemType(DD_Outputs_loc, &dd_outputs_ptr));
-    }
-  }
-  PetscCall(PetscLogEventEnd(FLUIDS_SgsModelDDData, DD_Inputs_loc, DD_Outputs_loc, NULL, NULL));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/qfunctions/sgs_dd_model.h b/examples/fluids/qfunctions/sgs_dd_model.h
deleted file mode 100644
index e904389814..0000000000
--- a/examples/fluids/qfunctions/sgs_dd_model.h
+++ /dev/null
@@ -1,259 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Structs and helper functions to evaluate data-driven subgrid-stress modeling
-/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy
-/// correction models for data-informed Reynolds stress closure' 2022
-#include <ceed.h>
-
-#include "newtonian_state.h"
-#include "newtonian_types.h"
-#include "sgs_dd_utils.h"
-#include "utils.h"
-#include "utils_eigensolver_jacobi.h"
-
-typedef struct SgsDDContext_ *SgsDDContext;
-struct SgsDDContext_ {
-  CeedInt    num_inputs, num_outputs;
-  CeedInt    num_layers;
-  CeedInt    num_neurons;
-  CeedScalar alpha;
-
-  struct NewtonianIdealGasContext_ gas;
-  struct {
-    size_t bias1, bias2;
-    size_t weight1, weight2;
-    size_t out_scaling;
-  } offsets;
-  size_t     total_bytes;
-  CeedScalar data[1];
-};
-
-CEED_QFUNCTION_HELPER void LeakyReLU(CeedScalar *x, const CeedScalar alpha, const CeedInt N) {
-  for (CeedInt i = 0; i < N; i++) x[i] *= (x[i] < 0 ? alpha : 1.);
-}
-
-CEED_QFUNCTION_HELPER void DataDrivenInference(const CeedScalar *inputs, CeedScalar *outputs, SgsDDContext sgsdd_ctx) {
-  const CeedInt     num_neurons = sgsdd_ctx->num_neurons;
-  const CeedInt     num_inputs  = sgsdd_ctx->num_inputs;
-  const CeedInt     num_outputs = sgsdd_ctx->num_outputs;
-  const CeedScalar  alpha       = sgsdd_ctx->alpha;
-  const CeedScalar *bias1       = &sgsdd_ctx->data[sgsdd_ctx->offsets.bias1];
-  const CeedScalar *bias2       = &sgsdd_ctx->data[sgsdd_ctx->offsets.bias2];
-  const CeedScalar *weight1     = &sgsdd_ctx->data[sgsdd_ctx->offsets.weight1];
-  const CeedScalar *weight2     = &sgsdd_ctx->data[sgsdd_ctx->offsets.weight2];
-  CeedScalar        V[20]       = {0.};
-
-  CopyN(bias1, V, num_neurons);
-  MatVecNM(weight1, inputs, num_neurons, num_inputs, CEED_NOTRANSPOSE, V);
-  LeakyReLU(V, alpha, num_neurons);
-  CopyN(bias2, outputs, num_outputs);
-  MatVecNM(weight2, V, num_outputs, num_neurons, CEED_NOTRANSPOSE, outputs);
-}
-
-CEED_QFUNCTION_HELPER void ComputeSgsDD_Fused(const CeedScalar grad_velo_aniso[3][3], const CeedScalar km_A_ij[6], const CeedScalar delta,
-                                              const CeedScalar viscosity, CeedScalar kmsgs_stress[6], SgsDDContext sgsdd_ctx) {
-  CeedScalar inputs[6], grad_velo_magnitude, eigenvectors[3][3], sgs_sframe_sym[6] = {0.}, new_bounds[6][2];
-  // Copying new_bounds because Sycl online compiler doesn't like direct casting the pointer
-  CopyN(&sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling], (CeedScalar *)new_bounds, 12);
-
-  ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, viscosity, eigenvectors, inputs, &grad_velo_magnitude);
-  DataDrivenInference(inputs, sgs_sframe_sym, sgsdd_ctx);
-  ComputeSgsDDOutputs(sgs_sframe_sym, delta, eigenvectors, new_bounds, grad_velo_magnitude, kmsgs_stress);
-}
-
-// @brief Calculate subgrid stress at nodes using anisotropic data-driven model
-CEED_QFUNCTION_HELPER int ComputeSgsDDNodal_Fused(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
-                                                  StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]            = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  const CeedScalar(*inv_multiplicity)         = (const CeedScalar(*))in[4];
-  CeedScalar(*v)[CEED_Q_VLA]                  = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  const SgsDDContext             sgsdd_ctx = (SgsDDContext)ctx;
-  const NewtonianIdealGasContext gas       = &sgsdd_ctx->gas;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5]                 = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const CeedScalar grad_velo_aniso[3][3] = {
-        {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]},
-        {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]},
-        {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]}
-    };
-    const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]};
-    const CeedScalar delta      = A_ij_delta[6][i];
-    const State      s          = StateFromQ(gas, qi, state_var);
-    CeedScalar       km_sgs[6];
-
-    ComputeSgsDD_Fused(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, km_sgs, sgsdd_ctx);
-
-    for (int j = 0; j < 6; j++) v[j][i] = inv_multiplicity[i] * km_sgs[j];
-  }
-  return 0;
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_ENTROPY);
-}
-
-// @brief Calculate inputs to anisotropic data-driven model
-CEED_QFUNCTION_HELPER int ComputeSgsDDNodal_Sequential_Inputs(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
-                                                              StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]            = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-  const CeedScalar(*inv_multiplicity)         = (const CeedScalar(*))in[3];
-  CeedScalar(*eigenvectors_stored)            = out[0];
-  CeedScalar(*model_inputs)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
-
-  const SgsDDContext             sgsdd_ctx = (SgsDDContext)ctx;
-  const NewtonianIdealGasContext gas       = &sgsdd_ctx->gas;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5]                 = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const CeedScalar grad_velo_aniso[3][3] = {
-        {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]},
-        {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]},
-        {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]}
-    };
-    const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]};
-    const CeedScalar delta      = A_ij_delta[6][i];
-    const State      s          = StateFromQ(gas, qi, state_var);
-
-    CeedScalar model_inputs_i[6], grad_velo_magnitude, eigenvectors[3][3];
-    ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, eigenvectors, model_inputs_i, &grad_velo_magnitude);
-
-    ScaleN(model_inputs_i, inv_multiplicity[i], 6);
-    StoredValuesPack(Q, i, 0, 6, model_inputs_i, (CeedScalar *)model_inputs);
-    StoredValuesPack(Q, i, 0, 9, (const CeedScalar *)eigenvectors, eigenvectors_stored);
-    StoredValuesPack(Q, i, 9, 1, &grad_velo_magnitude, eigenvectors_stored);
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_ENTROPY);
-}
-
-// @brief Runs inference on the data-driven model, used predominantsly for testing and validation
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inference)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  const CeedScalar(*model_inputs)     = in[0];
-  const CeedScalar(*inv_multiplicity) = in[1];
-  CeedScalar(*model_outputs)          = out[0];
-
-  const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar model_inputs_i[6], model_outputs_i[6];
-    // CeedScalar model_outputs_i[6];
-    // CeedScalar model_inputs_i[6] = {1, 2, 3, 4, 5, 6};
-
-    StoredValuesUnpack(Q, i, 0, 6, (const CeedScalar *)model_inputs, model_inputs_i);
-    DataDrivenInference(model_inputs_i, model_outputs_i, sgsdd_ctx);
-    ScaleN(model_outputs_i, inv_multiplicity[i], 6);
-    StoredValuesPack(Q, i, 0, 6, model_outputs_i, model_outputs);
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-// @brief Calculates SGS from outputs of anisotropic data-driven model
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Outputs)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  const CeedScalar(*model_outputs)          = in[0];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  const CeedScalar(*inv_multiplicity)       = (const CeedScalar(*))in[2];
-  const CeedScalar(*eigenvectors_stored)    = in[3];
-  CeedScalar(*kmsgs_stress)[CEED_Q_VLA]     = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx;
-  CeedScalar         new_bounds[6][2];
-  CopyN(&sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling], (CeedScalar *)new_bounds, 12);
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar       model_outputs_i[6];
-    const CeedScalar delta = A_ij_delta[6][i];
-
-    StoredValuesUnpack(Q, i, 0, 6, model_outputs, model_outputs_i);
-    CeedScalar grad_velo_magnitude, eigenvectors[3][3], kmsgs_stress_i[6];
-    StoredValuesUnpack(Q, i, 0, 9, eigenvectors_stored, (CeedScalar *)eigenvectors);
-    StoredValuesUnpack(Q, i, 9, 1, eigenvectors_stored, &grad_velo_magnitude);
-    ComputeSgsDDOutputs(model_outputs_i, delta, eigenvectors, new_bounds, grad_velo_magnitude, kmsgs_stress_i);
-
-    for (int j = 0; j < 6; j++) kmsgs_stress[j][i] = inv_multiplicity[i] * kmsgs_stress_i[j];
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-// @brief Adds subgrid stress to residual (during IFunction evaluation)
-CEED_QFUNCTION_HELPER int FluxSubgridStress(const StatePrimitive Y, const CeedScalar km_sgs[6], CeedScalar Flux[5][3]) {
-  CeedScalar sgs[3][3];
-
-  KMUnpack(km_sgs, sgs);
-  for (CeedInt j = 0; j < 3; j++) {
-    Flux[0][j] = 0.;
-    for (CeedInt k = 0; k < 3; k++) Flux[k + 1][j] = sgs[k][j];
-    Flux[4][j] = Y.velocity[0] * sgs[0][j] + Y.velocity[1] * sgs[1][j] + Y.velocity[2] * sgs[2][j];
-  }
-  return 0;
-}
-
-CEED_QFUNCTION_HELPER int IFunction_NodalSgs(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]      = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*q_data)             = in[1];
-  const CeedScalar(*km_sgs)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-  CeedScalar(*Grad_v)[5][CEED_Q_VLA]    = (CeedScalar(*)[5][CEED_Q_VLA])out[0];
-
-  NewtonianIdealGasContext gas = (NewtonianIdealGasContext)ctx;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const State      s     = StateFromQ(gas, qi, state_var);
-
-    CeedScalar wdetJ, dXdx[3][3];
-    QdataUnpack_3D(Q, i, q_data, &wdetJ, dXdx);
-
-    CeedScalar       Flux[5][3];
-    const CeedScalar km_sgs_i[6] = {km_sgs[0][i], km_sgs[1][i], km_sgs[2][i], km_sgs[3][i], km_sgs[4][i], km_sgs[5][i]};
-    FluxSubgridStress(s.Y, km_sgs_i, Flux);
-
-    for (CeedInt k = 0; k < 3; k++) {
-      for (CeedInt j = 0; j < 5; j++) {
-        Grad_v[k][j][i] = -wdetJ * (dXdx[k][0] * Flux[j][0] + dXdx[k][1] * Flux[j][1] + dXdx[k][2] * Flux[j][2]);
-      }
-    }
-  }
-  return 0;
-}
-
-CEED_QFUNCTION(IFunction_NodalSgs_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
-}
-
-CEED_QFUNCTION(IFunction_NodalSgs_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
-
-CEED_QFUNCTION(IFunction_NodalSgs_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_ENTROPY);
-}
diff --git a/examples/fluids/qfunctions/sgs_dd_training.h b/examples/fluids/qfunctions/sgs_dd_training.h
deleted file mode 100644
index 803f959a1d..0000000000
--- a/examples/fluids/qfunctions/sgs_dd_training.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Structs and helper functions for training data-driven subgrid-stress models
-/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy
-/// correction models for data-informed Reynolds stress closure' 2022
-#include <ceed.h>
-
-#include "differential_filter_enums.h"
-#include "newtonian_state.h"
-#include "newtonian_types.h"
-#include "sgs_dd_utils.h"
-#include "utils.h"
-#include "utils_eigensolver_jacobi.h"
-
-typedef struct SGS_DD_TrainingContext_ *SGS_DDTrainingContext;
-struct SGS_DD_TrainingContext_ {
-  struct NewtonianIdealGasContext_ gas;
-};
-
-// @brief Calculate Data-Driven SGS model training data at nodes
-CEED_QFUNCTION_HELPER int ComputeSGS_DDAnisotropicTrainingDataNodal(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
-                                                                    StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]            = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*velo_prod)[CEED_Q_VLA]    = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  const CeedScalar(*inv_multiplicity)         = (const CeedScalar(*))in[4];
-  CeedScalar(*v)[CEED_Q_VLA]                  = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  const SGS_DDTrainingContext    sgsdd_ctx = (SGS_DDTrainingContext)ctx;
-  const NewtonianIdealGasContext gas       = &sgsdd_ctx->gas;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5]                 = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const CeedScalar grad_velo_aniso[3][3] = {
-        {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]},
-        {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]},
-        {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]}
-    };
-    const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]};
-    const CeedScalar delta      = A_ij_delta[6][i];
-    const State      s          = StateFromQ(gas, qi, state_var);
-    CeedScalar       inputs[6];
-    CeedScalar       eigenvectors[3][3], grad_velo_magnitude;  // dummy variables, don't actually use them
-
-    ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, eigenvectors, inputs, &grad_velo_magnitude);
-
-    for (int j = 0; j < 6; j++) v[j][i] = inv_multiplicity[i] * inputs[j];
-
-    v[0 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XX][i] - Square(s.Y.velocity[0])) * inv_multiplicity[i];
-    v[1 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_YY][i] - Square(s.Y.velocity[1])) * inv_multiplicity[i];
-    v[2 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_ZZ][i] - Square(s.Y.velocity[2])) * inv_multiplicity[i];
-    v[3 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_YZ][i] - s.Y.velocity[1] * s.Y.velocity[2]) * inv_multiplicity[i];
-    v[4 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XZ][i] - s.Y.velocity[0] * s.Y.velocity[2]) * inv_multiplicity[i];
-    v[5 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XY][i] - s.Y.velocity[0] * s.Y.velocity[1]) * inv_multiplicity[i];
-  }
-  return 0;
-}
-
-CEED_QFUNCTION(ComputeSGS_DDAnisotropicTrainingDataNodal_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSGS_DDAnisotropicTrainingDataNodal(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
diff --git a/examples/fluids/qfunctions/sgs_dd_utils.h b/examples/fluids/qfunctions/sgs_dd_utils.h
deleted file mode 100644
index 4bcb9fc181..0000000000
--- a/examples/fluids/qfunctions/sgs_dd_utils.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Structs and helper functions for data-driven subgrid-stress modeling
-/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy
-/// correction models for data-informed Reynolds stress closure' 2022
-#pragma once
-
-#include <ceed.h>
-
-#include "newtonian_state.h"
-#include "newtonian_types.h"
-#include "utils.h"
-#include "utils_eigensolver_jacobi.h"
-
-// @brief Calculate Frobenius norm of velocity gradient from eigenframe quantities
-CEED_QFUNCTION_HELPER CeedScalar VelocityGradientMagnitude(const CeedScalar strain_sframe[3], const CeedScalar vorticity_sframe[3]) {
-  return sqrt(Dot3(strain_sframe, strain_sframe) + 0.5 * Dot3(vorticity_sframe, vorticity_sframe));
-};
-
-// @brief Change the order of basis vectors so that they align with vector and obey right-hand rule
-// @details The e_1 and e_3 basis vectors are the closest aligned to the vector. The e_2 is set via  e_3 x e_1
-// The basis vectors are assumed to form the rows of the basis matrix.
-CEED_QFUNCTION_HELPER void OrientBasisWithVector(CeedScalar basis[3][3], const CeedScalar vector[3]) {
-  CeedScalar alignment[3] = {0.}, cross[3];
-
-  MatVec3(basis, vector, CEED_NOTRANSPOSE, alignment);
-
-  if (alignment[0] < 0) ScaleN(basis[0], -1, 3);
-  if (alignment[2] < 0) ScaleN(basis[2], -1, 3);
-
-  Cross3(basis[2], basis[0], cross);
-  CeedScalar basis_1_orientation = Dot3(cross, basis[1]);
-  if (basis_1_orientation < 0) ScaleN(basis[1], -1, 3);
-}
-
-// @brief Denormalize outputs using min-max (de-)normalization
-CEED_QFUNCTION_HELPER void DenormalizeDDOutputs(CeedScalar output[6], const CeedScalar new_bounds[6][2], const CeedScalar old_bounds[6][2]) {
-  CeedScalar bounds_ratio;
-  for (int i = 0; i < 6; i++) {
-    bounds_ratio = (new_bounds[i][1] - new_bounds[i][0]) / (old_bounds[i][1] - old_bounds[i][0]);
-    output[i]    = bounds_ratio * (output[i] - old_bounds[i][1]) + new_bounds[i][1];
-  }
-}
-
-/**
- * @brief Compute model inputs for anisotropic data-driven model
- *
- * @param[in]  grad_velo_aniso     Gradient of velocity in physical (anisotropic) coordinates
- * @param[in]  km_A_ij             Anisotropy tensor, in Kelvin-Mandel notation
- * @param[in]  delta               Length used to create anisotropy tensor
- * @param[in]  viscosity           Kinematic viscosity
- * @param[out] eigenvectors        Eigenvectors of the (anisotropic) velocity gradient
- * @param[out] inputs              Data-driven model inputs
- * @param[out] grad_velo_magnitude Frobenius norm of the velocity gradient
- */
-CEED_QFUNCTION_HELPER void ComputeSgsDDInputs(const CeedScalar grad_velo_aniso[3][3], const CeedScalar km_A_ij[6], const CeedScalar delta,
-                                              const CeedScalar viscosity, CeedScalar eigenvectors[3][3], CeedScalar inputs[6],
-                                              CeedScalar *grad_velo_magnitude) {
-  CeedScalar strain_sframe[3] = {0.}, vorticity_sframe[3] = {0.};
-  CeedScalar A_ij[3][3] = {{0.}}, grad_velo_iso[3][3] = {{0.}};
-
-  // -- Transform physical, anisotropic velocity gradient to isotropic
-  KMUnpack(km_A_ij, A_ij);
-  MatMat3(grad_velo_aniso, A_ij, CEED_NOTRANSPOSE, CEED_NOTRANSPOSE, grad_velo_iso);
-
-  {  // -- Get Eigenframe
-    CeedScalar kmstrain_iso[6], strain_iso[3][3];
-    CeedInt    work_vector[3] = {0};
-    KMStrainRate(grad_velo_iso, kmstrain_iso);
-    KMUnpack(kmstrain_iso, strain_iso);
-    Diagonalize3(strain_iso, strain_sframe, eigenvectors, work_vector, SORT_DECREASING_EVALS, true, 5);
-  }
-
-  {  // -- Get vorticity in S-frame
-    CeedScalar rotation_iso[3][3];
-    RotationRate(grad_velo_iso, rotation_iso);
-    CeedScalar vorticity_iso[3] = {-2 * rotation_iso[1][2], 2 * rotation_iso[0][2], -2 * rotation_iso[0][1]};
-    OrientBasisWithVector(eigenvectors, vorticity_iso);
-    MatVec3(eigenvectors, vorticity_iso, CEED_NOTRANSPOSE, vorticity_sframe);
-  }
-
-  // -- Calculate DD model inputs
-  *grad_velo_magnitude = VelocityGradientMagnitude(strain_sframe, vorticity_sframe);
-  inputs[0]            = strain_sframe[0];
-  inputs[1]            = strain_sframe[1];
-  inputs[2]            = strain_sframe[2];
-  inputs[3]            = vorticity_sframe[0];
-  inputs[4]            = vorticity_sframe[1];
-  inputs[5]            = viscosity / Square(delta);
-  ScaleN(inputs, 1 / (*grad_velo_magnitude + CEED_EPSILON), 6);
-}
-
-/**
- * @brief Compute the physical SGS stresses from the neural-network output
- *
- * @param[in,out] outputs             Outputs from the neural-network
- * @param[in]     delta               Length used to create anisotropy tensor
- * @param[in]     eigenvectors        Eigenvectors of the (anisotropic) velocity gradient
- * @param[in]     new_bounds          Bounds used for min-max de-normalization
- * @param[in]     grad_velo_magnitude Magnitude of the velocity gradient
- * @param[out]    kmsgs_stress        Physical SGS stresses in Kelvin-Mandel notation
- */
-CEED_QFUNCTION_HELPER void ComputeSgsDDOutputs(CeedScalar outputs[6], const CeedScalar delta, const CeedScalar eigenvectors[3][3],
-                                               const CeedScalar new_bounds[6][2], const CeedScalar grad_velo_magnitude, CeedScalar kmsgs_stress[6]) {
-  CeedScalar old_bounds[6][2] = {{0}};
-  for (int j = 0; j < 6; j++) old_bounds[j][1] = 1;
-  DenormalizeDDOutputs(outputs, new_bounds, old_bounds);
-
-  // Re-dimensionalize sgs_stress
-  ScaleN(outputs, Square(delta) * Square(grad_velo_magnitude), 6);
-
-  CeedScalar sgs_stress[3][3] = {{0.}};
-  {  // Rotate SGS Stress back to physical frame, SGS_physical = E^T SGS_sframe E
-    CeedScalar       Evec_sgs[3][3]   = {{0.}};
-    const CeedScalar sgs_sframe[3][3] = {
-        {outputs[0], outputs[3], outputs[4]},
-        {outputs[3], outputs[1], outputs[5]},
-        {outputs[4], outputs[5], outputs[2]},
-    };
-    MatMat3(eigenvectors, sgs_sframe, CEED_TRANSPOSE, CEED_NOTRANSPOSE, Evec_sgs);
-    MatMat3(Evec_sgs, eigenvectors, CEED_NOTRANSPOSE, CEED_NOTRANSPOSE, sgs_stress);
-  }
-
-  KMPack(sgs_stress, kmsgs_stress);
-}
diff --git a/examples/fluids/smartsim_regression_framework.py b/examples/fluids/smartsim_regression_framework.py
deleted file mode 100755
index 2834263e6a..0000000000
--- a/examples/fluids/smartsim_regression_framework.py
+++ /dev/null
@@ -1,241 +0,0 @@
-#!/usr/bin/env python3
-from junit_xml import TestCase
-from smartsim import Experiment
-from smartsim.settings import RunSettings
-from smartredis import Client
-import numpy as np
-from pathlib import Path
-import argparse
-import traceback
-import sys
-import time
-from typing import Tuple
-import os
-import shutil
-import logging
-import socket
-
-# autopep8 off
-sys.path.insert(0, (Path(__file__).parents[3] / "tests/junit-xml").as_posix())
-# autopep8 on
-
-logging.disable(logging.WARNING)
-
-fluids_example_dir = Path(__file__).parent.absolute()
-
-
-def getOpenSocket():
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(('', 0))
-    addr = s.getsockname()
-    s.close()
-    return addr[1]
-
-
-class NoError(Exception):
-    pass
-
-
-def assert_np_all(test, truth):
-    """Assert with better error reporting"""
-    try:
-        assert np.all(test == truth)
-    except Exception as e:
-        raise Exception(f"Expected {truth}, but got {test}") from e
-
-
-def assert_equal(test, truth):
-    """Assert with better error reporting"""
-    try:
-        assert test == truth
-    except Exception as e:
-        raise Exception(f"Expected {truth}, but got {test}") from e
-
-
-def verify_training_data(database_array, correct_array, ceed_resource, atol=1e-8, rtol=1e-8):
-    """Verify the training data
-
-    Cannot just use np.allclose due to vorticity vector directionality.
-    Check whether the S-frame-oriented vorticity vector's second component is just flipped.
-    This can happen due to the eigenvector ordering changing based on whichever one is closest to the vorticity vector.
-    If two eigenvectors are very close to the vorticity vector, this can cause the ordering to flip.
-    This flipping of the vorticity vector is not incorrect, just a known sensitivity of the model.
-    """
-    if not np.allclose(database_array, correct_array, atol=atol, rtol=rtol):
-
-        total_tolerances = atol + rtol * np.abs(correct_array)  # mimic np.allclose tolerance calculation
-        idx_notclose = np.where(np.abs(database_array - correct_array) > total_tolerances)
-        if not np.all(idx_notclose[1] == 4):
-            # values other than vorticity are not close
-            test_fail = True
-        else:
-            database_vorticity = database_array[idx_notclose]
-            correct_vorticity = correct_array[idx_notclose]
-            test_fail = False if np.allclose(-database_vorticity, correct_vorticity,
-                                             atol=atol, rtol=rtol) else True
-
-        if test_fail:
-            database_output_path = Path(
-                f"./y0_database_values_{ceed_resource.replace('/', '_')}.npy").absolute()
-            np.save(database_output_path, database_array)
-            raise AssertionError(f"Array values in database max difference: {np.max(np.abs(correct_array - database_array))}\n"
-                                 f"Array saved to {database_output_path.as_posix()}")
-
-
-class SmartSimTest(object):
-
-    def __init__(self, directory_path: Path):
-        self.exp: Experiment
-        self.database = None
-        self.directory_path: Path = directory_path
-        self.original_path: Path
-
-    def setup(self):
-        """To create the test directory and start SmartRedis database"""
-        self.original_path = Path(os.getcwd())
-
-        if self.directory_path.exists() and self.directory_path.is_dir():
-            shutil.rmtree(self.directory_path)
-        self.directory_path.mkdir()
-        os.chdir(self.directory_path)
-
-        PORT = getOpenSocket()
-        self.exp = Experiment("test", launcher="local")
-        self.database = self.exp.create_database(port=PORT, batch=False, interface="lo")
-        self.exp.generate(self.database)
-        self.exp.start(self.database)
-
-        # SmartRedis will complain if these aren't set
-        os.environ['SR_LOG_FILE'] = 'R'
-        os.environ['SR_LOG_LEVEL'] = 'INFO'
-
-    def test(self, ceed_resource) -> Tuple[bool, Exception, str]:
-        client = None
-        arguments = []
-        exe_path = "../../build/fluids-navierstokes"
-        try:
-            arguments = [
-                '-ceed', ceed_resource,
-                '-options_file', (fluids_example_dir / 'blasius.yaml').as_posix(),
-                '-ts_max_steps', '2',
-                '-diff_filter_grid_based_width',
-                '-ts_monitor', '-snes_monitor',
-                '-diff_filter_ksp_max_it', '50', '-diff_filter_ksp_monitor',
-                '-degree', '1',
-                '-sgs_train_enable',
-                '-sgs_train_write_data_interval', '2',
-                '-sgs_train_filter_width_scales', '1.2,3.1',
-                '-bc_symmetry_z',
-                '-dm_plex_shape', 'zbox',
-                '-dm_plex_box_bd', 'none,none,periodic',
-                '-dm_plex_box_faces', '4,6,1',
-                '-mesh_transform',
-            ]
-
-            run_settings = RunSettings(exe_path, exe_args=arguments)
-
-            client_exp = self.exp.create_model(f"client_{ceed_resource.replace('/', '_')}", run_settings)
-
-            # Start the client model
-            self.exp.start(client_exp, summary=False, block=True)
-
-            client = Client(cluster=False, address=self.database.get_address()[0])
-
-            assert client.poll_tensor("sizeInfo", 250, 5)
-            assert_np_all(client.get_tensor("sizeInfo"), np.array([35, 12, 6, 1, 1, 0]))
-
-            assert client.poll_tensor("check-run", 250, 5)
-            assert_equal(client.get_tensor("check-run")[0], 1)
-
-            assert client.poll_tensor("tensor-ow", 250, 5)
-            assert_equal(client.get_tensor("tensor-ow")[0], 1)
-
-            assert client.poll_tensor("num_filter_widths", 250, 5)
-            assert_equal(client.get_tensor("num_filter_widths")[0], 2)
-
-            assert client.poll_tensor("step", 250, 5)
-            assert_equal(client.get_tensor("step")[0], 2)
-
-            assert client.poll_tensor("y.0.0", 250, 5)
-            test_data_path = fluids_example_dir / "tests-output/y00_output.npy"
-            assert test_data_path.is_file()
-            correct_value = np.load(test_data_path)
-            database_value = client.get_tensor("y.0.0")
-            verify_training_data(database_value, correct_value, ceed_resource)
-
-            assert client.poll_tensor("y.0.1", 250, 5)
-            test_data_path = fluids_example_dir / "tests-output/y01_output.npy"
-            assert test_data_path.is_file()
-            correct_value = np.load(test_data_path)
-            database_value = client.get_tensor("y.0.1")
-            verify_training_data(database_value, correct_value, ceed_resource)
-
-            client.flush_db([os.environ["SSDB"]])
-            output = (True, NoError(), exe_path + ' ' + ' '.join(arguments))
-        except Exception as e:
-            output = (False, e, exe_path + ' ' + ' '.join(arguments))
-
-        finally:
-            if client:
-                client.flush_db([os.environ["SSDB"]])
-
-        return output
-
-    def test_junit(self, ceed_resource):
-        start: float = time.time()
-
-        passTest, exception, args = self.test(ceed_resource)
-
-        output = "" if isinstance(exception, NoError) else ''.join(
-            traceback.TracebackException.from_exception(exception).format())
-
-        test_case = TestCase(f'SmartSim Test {ceed_resource}',
-                             elapsed_sec=time.time() - start,
-                             timestamp=time.strftime(
-                                 '%Y-%m-%d %H:%M:%S %Z', time.localtime(start)),
-                             stdout=output,
-                             stderr=output,
-                             allow_multiple_subelements=True,
-                             category=f'SmartSim Tests')
-        test_case.args = args
-        if not passTest and 'occa' in ceed_resource:
-            test_case.add_skipped_info("OCCA mode not supported")
-        elif not passTest:
-            test_case.add_failure_info("exception", output)
-
-        return test_case
-
-    def teardown(self):
-        self.exp.stop(self.database)
-        os.chdir(self.original_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser('Testing script for SmartSim integration')
-    parser.add_argument(
-        '-c',
-        '--ceed-backends',
-        type=str,
-        nargs='*',
-        default=['/cpu/self'],
-        help='libCEED backend to use with convergence tests')
-    args = parser.parse_args()
-
-    test_dir = fluids_example_dir / "test_dir"
-    print("Setting up database...", end='')
-    test_framework = SmartSimTest(test_dir)
-    test_framework.setup()
-    print(" Done!")
-    for ceed_resource in args.ceed_backends:
-        print("working on " + ceed_resource + ' ...', end='')
-        passTest, exception, _ = test_framework.test(ceed_resource)
-
-        if passTest:
-            print("Passed!")
-        else:
-            print("Failed!", file=sys.stderr)
-            print('\t' + ''.join(traceback.TracebackException.from_exception(exception).format()), file=sys.stderr)
-
-    print("Cleaning up database...", end='')
-    test_framework.teardown()
-    print(" Done!")
diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c
index 265092badd..6dd1d36b6b 100644
--- a/examples/fluids/src/cloptions.c
+++ b/examples/fluids/src/cloptions.c
@@ -149,11 +149,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC
   PetscCall(PetscOptionsViewer("-ts_monitor_wall_force", "Viewer for force on each (no-slip) wall", NULL, &app_ctx->wall_forces.viewer,
                                &app_ctx->wall_forces.viewer_format, NULL));
 
-  // SGS Model Options
-  app_ctx->sgs_model_type = SGS_MODEL_NONE;
-  PetscCall(PetscOptionsEnum("-sgs_model_type", "Subgrid Stress Model type", NULL, SGSModelTypes, (PetscEnum)app_ctx->sgs_model_type,
-                             (PetscEnum *)&app_ctx->sgs_model_type, NULL));
-
   PetscCall(PetscOptionsBool("-diff_filter_monitor", "Enable differential filtering TSMonitor", NULL, app_ctx->diff_filter_monitor,
                              &app_ctx->diff_filter_monitor, NULL));
 
@@ -162,9 +157,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC
   PetscCall(PetscOptionsEnum("-mesh_transform", "Mesh transform to perform", NULL, MeshTransformTypes, (PetscEnum)app_ctx->mesh_transform_type,
                              (PetscEnum *)&app_ctx->mesh_transform_type, NULL));
 
-  PetscCall(
-      PetscOptionsBool("-sgs_train_enable", "Enable Data-Driven SGS training", NULL, app_ctx->sgs_train_enable, &app_ctx->sgs_train_enable, NULL));
-
   PetscOptionsEnd();
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/log_events.c b/examples/fluids/src/log_events.c
index c5e968b485..1bf3b3b039 100644
--- a/examples/fluids/src/log_events.c
+++ b/examples/fluids/src/log_events.c
@@ -8,7 +8,7 @@
 #include <log_events.h>
 #include <petsc.h>
 
-static PetscClassId libCEED_classid, onlineTrain_classid, sgs_model_classid, misc_classid;
+static PetscClassId libCEED_classid, misc_classid;
 
 PetscLogEvent FLUIDS_CeedOperatorApply;
 PetscLogEvent FLUIDS_CeedOperatorAssemble;
@@ -20,9 +20,6 @@ PetscLogEvent FLUIDS_SmartRedis_Train;
 PetscLogEvent FLUIDS_TrainDataCompute;
 PetscLogEvent FLUIDS_DifferentialFilter;
 PetscLogEvent FLUIDS_VelocityGradientProjection;
-PetscLogEvent FLUIDS_SgsModel;
-PetscLogEvent FLUIDS_SgsModelDDInference;
-PetscLogEvent FLUIDS_SgsModelDDData;
 
 PetscErrorCode RegisterLogEvents() {
   PetscFunctionBeginUser;
@@ -32,17 +29,6 @@ PetscErrorCode RegisterLogEvents() {
   PetscCall(PetscLogEventRegister("CeedOpAsmD", libCEED_classid, &FLUIDS_CeedOperatorAssembleDiagonal));
   PetscCall(PetscLogEventRegister("CeedOpAsmPBD", libCEED_classid, &FLUIDS_CeedOperatorAssemblePointBlockDiagonal));
 
-  PetscCall(PetscClassIdRegister("onlineTrain", &onlineTrain_classid));
-  PetscCall(PetscLogEventRegister("SmartRedis_Init", onlineTrain_classid, &FLUIDS_SmartRedis_Init));
-  PetscCall(PetscLogEventRegister("SmartRedis_Meta", onlineTrain_classid, &FLUIDS_SmartRedis_Meta));
-  PetscCall(PetscLogEventRegister("SmartRedis_Train", onlineTrain_classid, &FLUIDS_SmartRedis_Train));
-  PetscCall(PetscLogEventRegister("TrainDataCompute", onlineTrain_classid, &FLUIDS_TrainDataCompute));
-
-  PetscCall(PetscClassIdRegister("SGS Model", &sgs_model_classid));
-  PetscCall(PetscLogEventRegister("SgsModel", sgs_model_classid, &FLUIDS_SgsModel));
-  PetscCall(PetscLogEventRegister("SgsModelDDInfer", sgs_model_classid, &FLUIDS_SgsModelDDInference));
-  PetscCall(PetscLogEventRegister("SgsModelDDData", sgs_model_classid, &FLUIDS_SgsModelDDData));
-
   PetscCall(PetscClassIdRegister("Miscellaneous", &misc_classid));
   PetscCall(PetscLogEventRegister("DiffFilter", misc_classid, &FLUIDS_DifferentialFilter));
   PetscCall(PetscLogEventRegister("VeloGradProj", misc_classid, &FLUIDS_VelocityGradientProjection));
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index f249bb6878..628b370617 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -448,7 +448,6 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
 
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_rhs));
     PetscCall(CreateKSPMass(user, problem));
-    PetscCheck(app_ctx->sgs_model_type == SGS_MODEL_NONE, user->comm, PETSC_ERR_SUP, "SGS modeling not implemented for explicit timestepping");
   } else {  // IFunction
     CeedOperator op_ijacobian = NULL;
 
@@ -470,13 +469,11 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
       PetscCall(MatCeedSetLocalVectors(user->mat_ijacobian, user->Q_dot_loc, NULL));
       PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian));
     }
-    if (app_ctx->sgs_model_type == SGS_MODEL_DATA_DRIVEN) PetscCall(SgsDDSetup(ceed, user, ceed_data, problem));
   }
 
   if (problem->use_strong_bc_ceed) PetscCall(SetupStrongBC_Ceed(ceed, ceed_data, dm, user, problem, bc));
   if (app_ctx->turb_spanstats_enable) PetscCall(TurbulenceStatisticsSetup(ceed, user, ceed_data, problem));
   if (app_ctx->diff_filter_monitor && !user->diff_filter) PetscCall(DifferentialFilterSetup(ceed, user, ceed_data, problem));
-  if (app_ctx->sgs_train_enable) PetscCall(SGS_DD_TrainingSetup(ceed, user, ceed_data, problem));
 
   PetscCallCeed(ceed, CeedVectorDestroy(&jac_data));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_jd_i));
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 8ed11e10c0..9b9669fe45 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -140,10 +140,6 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u
   PetscCall(VecReadCeedToPetsc(user->q_dot_ceed, q_dot_mem_type, Q_dot_loc));
   PetscCall(VecCeedToPetsc(user->g_ceed, g_mem_type, G_loc));
 
-  if (user->app_ctx->sgs_model_type == SGS_MODEL_DATA_DRIVEN) {
-    PetscCall(SgsDDApplyIFunction(user, Q_loc, G_loc));
-  }
-
   // Local-to-Global
   PetscCall(VecZeroEntries(G));
   PetscCall(DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G));
@@ -373,11 +369,6 @@ PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Proble
   }
   if (app_ctx->diff_filter_monitor) PetscCall(TSMonitorSet(*ts, TSMonitor_DifferentialFilter, user, NULL));
 
-  if (app_ctx->sgs_train_enable) {
-    PetscCall(TSMonitorSet(*ts, TSMonitor_SGS_DD_Training, user, NULL));
-    PetscCall(TSSetPostStep(*ts, TSPostStep_SGS_DD_Training));
-  }
-
   if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, user->phys, problem, *ts));
   // Solve
   PetscReal start_time;
diff --git a/examples/fluids/src/smartsim/sgs_dd_training.c b/examples/fluids/src/smartsim/sgs_dd_training.c
deleted file mode 100644
index c3ff2ac43b..0000000000
--- a/examples/fluids/src/smartsim/sgs_dd_training.c
+++ /dev/null
@@ -1,390 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../qfunctions/sgs_dd_training.h"
-
-#include <petscdmplex.h>
-
-#include "../../include/smartsim.h"
-#include "../../navierstokes.h"
-
-typedef struct {
-  CeedElemRestriction  elem_restr_grid_aniso;
-  CeedVector           grid_aniso_ceed;
-  CeedQFunctionContext sgs_dd_train_qfctx;
-} *SGS_DD_TrainingSetupData;
-
-static PetscErrorCode SGS_DD_TrainingSetupDataDestroy(SGS_DD_TrainingSetupData sgs_dd_train_setup_data) {
-  Ceed ceed;
-
-  PetscFunctionBeginUser;
-  PetscCall(CeedElemRestrictionGetCeed(sgs_dd_train_setup_data->elem_restr_grid_aniso, &ceed));
-
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_train_setup_data->elem_restr_grid_aniso));
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_train_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_train_setup_data->sgs_dd_train_qfctx));
-  PetscCall(PetscFree(sgs_dd_train_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create DM for storing data-drive SGS model inputs
-static PetscErrorCode SGS_DD_TrainingCreateDM(DM dm_source, DM *dm_dd_training, PetscInt degree, PetscInt q_extra, PetscInt *num_components) {
-  PetscSection section;
-
-  PetscFunctionBeginUser;
-  *num_components = 12;
-
-  PetscCall(DMClone(dm_source, dm_dd_training));
-  PetscCall(PetscObjectSetName((PetscObject)*dm_dd_training, "Data-Driven SGS Training Data"));
-
-  PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, num_components, *dm_dd_training));
-
-  PetscCall(DMGetLocalSection(*dm_dd_training, &section));
-  PetscCall(PetscSectionSetFieldName(section, 0, "Data-Driven SGS Training Data"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 0, "SGSInput1"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 1, "SGSInput2"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 2, "SGSInput3"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 3, "SGSInput4"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 4, "SGSInput5"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 5, "SGSInput6"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 6, "FilteredSGSXX"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 7, "FilteredSGSYY"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 8, "FilteredSGSZZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 9, "FilteredSGSYZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 10, "FilteredSGSXZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 11, "FilteredSGSXY"));
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-// @brief Create CeedOperator to calculate training data for data-drive SGS model at nodes
-static PetscErrorCode SetupTrainingDataCalculation(Ceed ceed, User user, CeedData ceed_data, ProblemData problem,
-                                                   SGS_DD_TrainingSetupData sgs_dd_train_setup_data) {
-  SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train;
-  CeedQFunction       qf_sgs_dd_train;
-  CeedOperator        op_sgs_dd_train;
-  CeedInt             num_comp_grad_velo, num_comp_grid_aniso;
-  CeedVector          inv_multiplicity, filtered_fields;
-  CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs_train;
-  DMLabel             domain_label = NULL;
-  PetscInt            label_value = 0, height = 0, dm_field = 0;
-
-  PetscFunctionBeginUser;
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_train_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso));
-
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_train->dm_dd_training, domain_label, label_value, height, dm_field, &elem_restr_sgs_train));
-  PetscCall(GetInverseMultiplicity(ceed, sgs_dd_train->dm_dd_training, domain_label, label_value, height, dm_field, PETSC_TRUE,
-                                   &elem_restr_inv_multiplicity, &inv_multiplicity));
-
-  CeedElemRestriction elem_restr_filtered_state;
-  CeedInt             num_comp_filtered_state;
-  {  // -- Setup filtered velocity gradient projection
-    CeedBasis         basis_filtered_state;
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->diff_filter->op_rhs_ctx->op, "v0", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filtered_state));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_filtered_state, &num_comp_filtered_state));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(op_field, &basis_filtered_state));
-    PetscCall(VelocityGradientProjectionSetup(ceed, user, ceed_data, problem, STATEVAR_PRIMITIVE, elem_restr_filtered_state, basis_filtered_state,
-                                              &sgs_dd_train->filtered_grad_velo_proj));
-    // Get velocity gradient information
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(sgs_dd_train->filtered_grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo));
-  }
-
-  CeedElemRestriction elem_restr_filtered_velo_prod;
-  CeedInt             num_comp_filtered_velo_prod;
-  {  // Get filtered velocity product information
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->diff_filter->op_rhs_ctx->op, "v1", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filtered_velo_prod));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_filtered_velo_prod, &num_comp_filtered_velo_prod));
-  }
-
-  // -- Create operator for generating training data at nodes
-  // Differential Filter only provides filtered primitive variables
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSGS_DDAnisotropicTrainingDataNodal_Prim,
-                                                  ComputeSGS_DDAnisotropicTrainingDataNodal_Prim_loc, &qf_sgs_dd_train));
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_train, sgs_dd_train_setup_data->sgs_dd_train_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "q", num_comp_filtered_state, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "velocity product", num_comp_filtered_velo_prod, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "inverse multiplicity", 1, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_train, "training data", sgs_dd_train->num_comp_dd_inputs, CEED_EVAL_NONE));
-
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_filtered_state, &filtered_fields, NULL));
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_train, NULL, NULL, &op_sgs_dd_train));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "q", elem_restr_filtered_state, CEED_BASIS_NONE, filtered_fields));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "velocity product", elem_restr_filtered_velo_prod, CEED_BASIS_NONE, filtered_fields));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "anisotropy tensor", sgs_dd_train_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                           sgs_dd_train_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "training data", elem_restr_sgs_train, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  PetscCall(OperatorApplyContextCreate(sgs_dd_train->filtered_grad_velo_proj->dm, sgs_dd_train->dm_dd_training, ceed, op_sgs_dd_train, NULL, NULL,
-                                       NULL, NULL, &sgs_dd_train->op_training_data_calc_ctx));
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity));
-  PetscCallCeed(ceed, CeedVectorDestroy(&filtered_fields));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_train));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_train));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) {
-  SGS_DDTrainingContext    sgsdd_train_qfctx;
-  SGS_DD_TrainingSetupData sgs_dd_train_setup_data;
-
-  PetscFunctionBeginUser;
-  if (!user->diff_filter) PetscCall(DifferentialFilterSetup(ceed, user, ceed_data, problem));
-  if (!user->smartsim) PetscCall(SmartSimSetup(user));
-
-  PetscCall(PetscNew(&sgsdd_train_qfctx));
-  PetscCall(PetscNew(&sgs_dd_train_setup_data));
-  PetscCall(PetscNew(&user->sgs_dd_train));
-  SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train;
-
-  sgs_dd_train->overwrite_training_data = PETSC_TRUE;
-  sgs_dd_train->write_data_interval     = 1;
-  sgs_dd_train->num_filter_widths       = sizeof(sgs_dd_train->filter_widths) / sizeof(sgs_dd_train->filter_widths[0]);
-  PetscOptionsBegin(user->comm, NULL, "SGS Data-Driven Training Options", NULL);
-  PetscCall(PetscOptionsInt("-sgs_train_write_data_interval", "Number of timesteps between writing data into database", NULL,
-                            sgs_dd_train->write_data_interval, &sgs_dd_train->write_data_interval, NULL));
-  PetscCall(PetscOptionsBool("-sgs_train_overwrite_data", "Overwrite old training data in the database", NULL, sgs_dd_train->overwrite_training_data,
-                             &sgs_dd_train->overwrite_training_data, NULL));
-  PetscCall(PetscOptionsRealArray("-sgs_train_filter_width_scales", "Scales of each filter width put into training database", NULL,
-                                  sgs_dd_train->filter_widths, &sgs_dd_train->num_filter_widths, NULL));
-  PetscOptionsEnd();
-
-  // -- Create DM for storing training data
-  PetscCall(SGS_DD_TrainingCreateDM(user->dm, &sgs_dd_train->dm_dd_training, user->app_ctx->degree, user->app_ctx->q_extra,
-                                    &sgs_dd_train->num_comp_dd_inputs));
-
-  {  // -- Create QFunction Context
-    NewtonianIdealGasContext gas;
-    PetscCallCeed(ceed, CeedQFunctionContextGetDataRead(problem->apply_vol_ifunction.qfunction_context, CEED_MEM_HOST, &gas));
-    sgsdd_train_qfctx->gas = *gas;
-    PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas));
-    PetscCallCeed(ceed, CeedQFunctionContextCreate(user->ceed, &sgs_dd_train_setup_data->sgs_dd_train_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionContextSetData(sgs_dd_train_setup_data->sgs_dd_train_qfctx, CEED_MEM_HOST, CEED_USE_POINTER,
-                                                    sizeof(*sgsdd_train_qfctx), sgsdd_train_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(sgs_dd_train_setup_data->sgs_dd_train_qfctx, CEED_MEM_HOST, FreeContextPetsc));
-  }
-
-  {  // -- Send training data array info to SmartRedis database
-    PetscMPIInt  rank, num_ranks;
-    SmartSimData smartsim = user->smartsim;
-    PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-    PetscCallMPI(MPI_Comm_size(user->comm, &num_ranks));
-
-    {
-      PetscSection global_section;
-      PetscInt     num_dofs, num_comps, local_min_max[2] = {0.}, global_min_max[2] = {0.};
-
-      PetscCall(DMGetGlobalSection(sgs_dd_train->dm_dd_training, &global_section));
-      PetscCall(DMGetGlobalVectorInfo(sgs_dd_train->dm_dd_training, &num_dofs, NULL, NULL));
-      PetscCall(PetscSectionGetFieldComponents(global_section, 0, &num_comps));
-      local_min_max[0] = num_dofs;
-      PetscCall(PetscGlobalMinMaxInt(user->comm, local_min_max, global_min_max));
-
-      sgs_dd_train->training_data_array_dims[0] = global_min_max[0] / num_comps;
-      sgs_dd_train->training_data_array_dims[1] = num_comps;
-    }
-
-    if (rank % smartsim->collocated_database_num_ranks == 0) {
-      {  // Communicate info on simulation size
-        const char tensor_name[]  = "sizeInfo";
-        size_t     array_info_dim = 6;
-        PetscInt64 array_info[6] = {0}, num_features = 6;
-
-        array_info[0] = sgs_dd_train->training_data_array_dims[0];
-        array_info[1] = sgs_dd_train->training_data_array_dims[1];
-        array_info[2] = num_features;
-        array_info[3] = num_ranks;
-        array_info[4] = smartsim->collocated_database_num_ranks;
-        array_info[5] = rank;
-
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-        PetscCallSmartRedis(
-            put_tensor(smartsim->client, tensor_name, strlen(tensor_name), array_info, &array_info_dim, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-        PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name)));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-      }
-
-      {  // Send array that communicates if tensors are overwritten in database
-        const char tensor_name[]       = "tensor-ow";
-        PetscInt64 tensor_overwrite[2] = {sgs_dd_train->overwrite_training_data};
-        size_t     dim_2[1]            = {2};
-
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-        PetscCallSmartRedis(
-            put_tensor(smartsim->client, tensor_name, strlen(tensor_name), tensor_overwrite, dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-        PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name)));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-      }
-
-      {  // Communicate number of filter widths used
-        const char tensor_name[]     = "num_filter_widths";
-        PetscInt64 num_filter_widths = sgs_dd_train->num_filter_widths;
-        size_t     dim_2             = 1;
-
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-        PetscCallSmartRedis(
-            put_tensor(smartsim->client, tensor_name, strlen(tensor_name), &num_filter_widths, &dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-        PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name)));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-      }
-    }
-  }
-
-  // -- Compute and store anisotropy tensor
-  PetscCall(GridAnisotropyTensorProjectionSetupApply(ceed, user, ceed_data, &sgs_dd_train_setup_data->elem_restr_grid_aniso,
-                                                     &sgs_dd_train_setup_data->grid_aniso_ceed));
-
-  // -- Create Nodal Evaluation Operator
-  PetscCall(SetupTrainingDataCalculation(ceed, user, ceed_data, problem, sgs_dd_train_setup_data));
-
-  PetscCall(SGS_DD_TrainingSetupDataDestroy(sgs_dd_train_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) {
-  User                user         = (User)ctx;
-  Ceed                ceed         = user->ceed;
-  SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train;
-  SmartSimData        smartsim     = user->smartsim;
-  Vec                 TrainingData;
-  PetscMPIInt         rank;
-
-  PetscFunctionBeginUser;
-
-  PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-
-  if (step_num % sgs_dd_train->write_data_interval != 0) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(DMGetGlobalVector(sgs_dd_train->dm_dd_training, &TrainingData));
-
-  for (PetscInt filter_index = 0; filter_index < sgs_dd_train->num_filter_widths; filter_index++) {
-    PetscCall(PetscLogEventBegin(FLUIDS_TrainDataCompute, 0, 0, 0, 0));
-    {  // -- Compute and assemble training data
-      Vec          FilteredVelocityGradient, FilteredFields, FilteredFields_loc;
-      PetscMemType filtered_fields_mem_type;
-      CeedVector   filtered_fields;
-
-      {  // Set filter width for the current solve
-        double       filter_width_scaling[3];
-        CeedOperator op_mat;
-        Mat          A_mat;
-
-        for (int j = 0; j < 3; j++) filter_width_scaling[j] = sgs_dd_train->filter_widths[filter_index];
-        PetscCall(KSPGetOperators(user->diff_filter->ksp, &A_mat, NULL));
-        PetscCall(MatCeedGetCeedOperators(A_mat, &op_mat, NULL));
-        PetscCall(CeedOperatorSetContextDouble(op_mat, user->diff_filter->filter_width_scaling_label, filter_width_scaling));
-      }
-
-      PetscCall(DMGetGlobalVector(user->diff_filter->dm_filter, &FilteredFields));
-      PetscCall(DMGetLocalVector(user->diff_filter->dm_filter, &FilteredFields_loc));
-
-      PetscCall(DifferentialFilterApply(user, solution_time, Q, FilteredFields));
-      PetscCall(DMGlobalToLocal(user->diff_filter->dm_filter, FilteredFields, INSERT_VALUES, FilteredFields_loc));
-
-      PetscCall(DMGetGlobalVector(sgs_dd_train->filtered_grad_velo_proj->dm, &FilteredVelocityGradient));
-      PetscCall(VelocityGradientProjectionApply(sgs_dd_train->filtered_grad_velo_proj, FilteredFields_loc, FilteredVelocityGradient));
-
-      {
-        CeedOperatorField op_field;
-
-        PetscCallCeed(ceed, CeedOperatorGetFieldByName(sgs_dd_train->op_training_data_calc_ctx->op, "q", &op_field));
-        PetscCallCeed(ceed, CeedOperatorFieldGetVector(op_field, &filtered_fields));
-      }
-
-      PetscCall(VecPetscToCeed(FilteredFields_loc, &filtered_fields_mem_type, filtered_fields));  // filtered_fields is an implicit input
-      PetscCall(ApplyCeedOperatorGlobalToGlobal(FilteredVelocityGradient, TrainingData, sgs_dd_train->op_training_data_calc_ctx));
-      PetscCall(VecCeedToPetsc(filtered_fields, filtered_fields_mem_type, FilteredFields_loc));
-
-      PetscCall(DMRestoreGlobalVector(sgs_dd_train->filtered_grad_velo_proj->dm, &FilteredVelocityGradient));
-      PetscCall(DMRestoreGlobalVector(user->diff_filter->dm_filter, &FilteredFields));
-      PetscCall(DMRestoreLocalVector(user->diff_filter->dm_filter, &FilteredFields_loc));
-    }
-    PetscCall(PetscLogEventEnd(FLUIDS_TrainDataCompute, 0, 0, 0, 0));
-
-    {  // -- Send training data to SmartSim
-      char   array_key[PETSC_MAX_PATH_LEN];
-      size_t array_key_len;
-
-      if (sgs_dd_train->overwrite_training_data) {
-        PetscCall(PetscSNPrintf(array_key, sizeof array_key, "%s.%" PetscInt_FMT, smartsim->rank_id_name, filter_index));
-      } else {
-        PetscCall(PetscSNPrintf(array_key, sizeof array_key, "%s.%" PetscInt_FMT "%" PetscInt_FMT, smartsim->rank_id_name, step_num, filter_index));
-      }
-      PetscCall(PetscStrlen(array_key, &array_key_len));
-
-      {
-        const PetscScalar *training_data;
-        PetscCall(VecGetArrayRead(TrainingData, &training_data));
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Train, 0, 0, 0, 0));
-        PetscCallSmartRedis(put_tensor(smartsim->client, array_key, array_key_len, (void *)training_data, sgs_dd_train->training_data_array_dims, 2,
-                                       SRTensorTypeDouble, SRMemLayoutContiguous));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Train, 0, 0, 0, 0));
-        PetscCall(VecRestoreArrayRead(TrainingData, &training_data));
-      }
-    }
-  }
-
-  if (rank % smartsim->collocated_database_num_ranks == 0) {
-    const char tensor_name[] = "step";
-    size_t     dim_2[1]      = {2};
-    PetscInt64 step_array[2] = {step_num, step_num};
-
-    PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-    PetscCallSmartRedis(
-        put_tensor(smartsim->client, tensor_name, strlen(tensor_name), step_array, dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-    PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  }
-
-  PetscCall(DMRestoreGlobalVector(user->sgs_dd_train->dm_dd_training, &TrainingData));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) {
-  User         user;
-  const char   check_run_key[]   = "check-run";
-  PetscReal    check_run[2]      = {1};
-  const size_t check_run_dims[1] = {2};
-  size_t       check_run_key_size;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscStrlen(check_run_key, &check_run_key_size));
-  PetscCall(TSGetApplicationContext(ts, &user));
-  SmartSimData smartsim = user->smartsim;
-
-  PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  PetscCallSmartRedis(
-      unpack_tensor(smartsim->client, check_run_key, check_run_key_size, check_run, check_run_dims, 1, SRTensorTypeDouble, SRMemLayoutContiguous));
-  PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  if (check_run[0] == 0) {
-    PetscCall(PetscPrintf(user->comm, "-- Simulation stopped by 'check-run' tensor in Redis database\n"));
-    PetscCall(TSSetConvergedReason(ts, TS_CONVERGED_USER));
-  }
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) {
-  PetscFunctionBeginUser;
-  if (!sgs_dd_train) PetscFunctionReturn(PETSC_SUCCESS);
-
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_train->op_training_data_calc_ctx));
-  PetscCall(NodalProjectionDataDestroy(sgs_dd_train->filtered_grad_velo_proj));
-  PetscCall(DMDestroy(&sgs_dd_train->dm_dd_training));
-  PetscCall(PetscFree(sgs_dd_train));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/src/smartsim/smartsim.c b/examples/fluids/src/smartsim/smartsim.c
deleted file mode 100644
index 03ddab9606..0000000000
--- a/examples/fluids/src/smartsim/smartsim.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-// Based on the instructions from https://www.craylabs.org/docs/sr_integration.html and PHASTA implementation
-
-#include "../../include/smartsim.h"
-
-#include "../../navierstokes.h"
-
-PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length) {
-  bool does_exist = true;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  PetscCallSmartRedis(tensor_exists(c_client, name, name_length, &does_exist));
-  PetscCheck(does_exist, PETSC_COMM_SELF, -1, "Tensor of name '%s' was not written to the database successfully", name);
-  PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SmartSimTrainingSetup(User user) {
-  SmartSimData smartsim = user->smartsim;
-  PetscMPIInt  rank;
-  PetscReal    checkrun[2] = {1};
-  size_t       dim_2[1]    = {2};
-
-  PetscFunctionBeginUser;
-  PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-
-  if (rank % smartsim->collocated_database_num_ranks == 0) {
-    // -- Send array that communicates when ML is done training
-    PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-    PetscCallSmartRedis(put_tensor(smartsim->client, "check-run", 9, checkrun, dim_2, 1, SRTensorTypeDouble, SRMemLayoutContiguous));
-    PetscCall(SmartRedisVerifyPutTensor(smartsim->client, "check-run", 9));
-    PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  }
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SmartSimSetup(User user) {
-  PetscMPIInt rank;
-  PetscInt    num_orchestrator_nodes = 1;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscNew(&user->smartsim));
-  SmartSimData smartsim = user->smartsim;
-
-  smartsim->collocated_database_num_ranks = 1;
-  PetscOptionsBegin(user->comm, NULL, "Options for SmartSim integration", NULL);
-  PetscCall(PetscOptionsInt("-smartsim_collocated_database_num_ranks", "Number of ranks per collocated database instance", NULL,
-                            smartsim->collocated_database_num_ranks, &smartsim->collocated_database_num_ranks, NULL));
-  PetscOptionsEnd();
-
-  // Create prefix to be put on tensor names
-  PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-  PetscCall(PetscSNPrintf(smartsim->rank_id_name, sizeof(smartsim->rank_id_name), "y.%d", rank));
-
-  PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Init, 0, 0, 0, 0));
-  PetscCallSmartRedis(SmartRedisCClient(num_orchestrator_nodes != 1, smartsim->rank_id_name, strlen(smartsim->rank_id_name), &smartsim->client));
-  PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Init, 0, 0, 0, 0));
-
-  PetscCall(SmartSimTrainingSetup(user));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) {
-  PetscFunctionBeginUser;
-  if (!smartsim) PetscFunctionReturn(PETSC_SUCCESS);
-
-  PetscCallSmartRedis(DeleteCClient(&smartsim->client));
-  PetscCall(PetscFree(smartsim));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/src/smartsim_weak.c b/examples/fluids/src/smartsim_weak.c
deleted file mode 100644
index 9c97419a8c..0000000000
--- a/examples/fluids/src/smartsim_weak.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-//
-// @file This creates weak functions for smartsim dependent functions. If the smartsim-dependent functions are actually built, these functions are not
-// linked to the final executable.
-
-#include "../navierstokes.h"
-
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) __attribute__((weak));
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__);
-};
-
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) __attribute__((weak));
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__);
-};
-
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) __attribute__((weak));
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) {
-  PetscFunctionBeginUser;
-  if (!sgs_dd_train) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Warning: SGS_DD_TrainingData struct should not be initialized if SMARTREDIS_DIR isn't set on build..."));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) __attribute__((weak));
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__);
-};
-
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) __attribute__((weak));
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) {
-  PetscFunctionBeginUser;
-  if (!smartsim) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Warning: SmartSimData struct should not be initialized if SMARTREDIS_DIR isn't set on build..."));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT.pt b/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT.pt
deleted file mode 100644
index dd31c24d27df99507cd00895b727330dbe52cdb1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1908
zcmZvac~lcu6vnd=OhAyet}GT11Y?2;0-5(hTZ>8*E3Fu%5<?hCG$y=cP_!Ug#R>(r
z$D*x;wyw3*{a_K8H;G56p!A3fu25Ips9LMliXPfYsMhv4f6O`ay*I!6-S6I|yi&30
zd1dm`khweF8zfoEn|zzkz6%j2QZs<V;pAE^ltphbX%~=Io6%xMT=s&dKYL;_Qk03*
zlICoq&Ww1gG-6hy7Uhy!n{hE|L;OM{h}5{TqRn8jQpios)9~3ZqnScNwLl|qGA!mC
z<gOOv<*_yOY#9@&r`Res@f2$|X(hAA8TFJHaiY=?H&Wxt{$VC5iBz(XH0Bs6#M4t2
ztKJ~d=21qIO{}$&S$RfNww5BzHj7oeAXbd{k&zmArw8_?E{8;dzF)-2CX0@GzfTk+
zH;p%YM<$mlq;iFPtRg-^s)$2EmtTEu9&0f_l|ucdsyS+ZwQwa8skv(Zl}KD-a9Z=A
zkY^g|?+T#LUzSm4LtX~9xItie6&{z6Ad8Ea$>qq~;I8&(p#xmdV&tRd79-z2pdW?&
z(@?<6hGSne9QYdKSZS<cj9iuwi{4<7PM-svNTJi<Al4^qcW@te2!)2Gp<yq5#!6g1
zgI)u5`c#aOO5;&53l*}^5Erx<g*tr>?*qO`p|CU*o>zz>G$J-Y?R=8bW$P#%8sQ3t
z&7M^`D5?;RbWP#K5=E=IHdnZ8TqvdxjWW0t=d!p0M3O=@`oEeCNehwezgjGcEkt8n
zTAnS7U7=adwSC<vxv!}I_Qw04+BX^gE9$@D3P?%5vl;d|MlZ)@e#@Zj*E)DUpaUPS
z?#AOCMiAh;7JhJd6d2szgcBov0CDGXKz}q5PxTaow!PJ0Pxw{k_BN5~*RWE2cXbu<
zXzLxkuXHD<vA8qVq)5=cs24<SX(Emd8^LUyavLnmu7xQT+vz>+ZmKI+I+@U~{ghep
zYv|G^)9F&~MzB3Gj=o&xsd}jB0d?0V!=gHG08%E>sPYD*KT^&_^xMaboZk$hg^kRW
zc^LC%Uw|9Z*5J&v<*=d1%BV8E=z_3qFgRob_^2ibOm^#_1J3Z_vhmkpV}~y|)WC)P
zwcRi*=K|PaX`)+~x&x0(pCu-!IDlF445X(P6JM_QgUEWa6?Y8w2A5}*(D$XcK+oP2
z@eg(mByOp%fhVG~0E)i^n;t{#-7*NHMFW_lEehsxc`kD-eL3t}Sq!>yBFq>TOjpgh
z-Ox5FxxO^DxxTY=ZGFYG4Zuq{o)|u3y>fk1IX)EB22QSurr&ti9W+-zt;>v`i=$Fx
zbU~<xBj!U7M^~etqw4BCXbKcLe%e)KpBZ3PJ?8TdL94)#{_Piz8lQ1?UGsey?(3Gc
zOI&Wpva$Az_6)n(Gr}>v^)wX6d=4KT->4GyjhQpF?h+wa)Dlyk&!US+o~rD^RXAkZ
z4e0;Rd3b&WlQZHV6VuDLhukp%pSNc)KM79a=b8_&qsF8>kQ)G(oJ(LLU;+^)jD}5<
z*5kl?#dP<N?YK3`&WN`3(y2H8RL-9C8#sJ!9elB0=+qMY{ehqH^scjbOKu|ZL+MvU
z7geV0PTq)*ayzg`41>oUFTv8DQXtwy;!jLF@E4nUuzN&3{;t*wKda&33CWiTKO9KQ
zX7g}<zY?(a(@|KmdnhD-(JCuF%5deb9r)tz^H|f;k8YcvO^2F8=+g^#<EhoY^qF@y
zgN}<A>9J=XD69T>0KERX4zzQYf{}|_z_kk<#Nyp&4(hI-BG54@%s7z>)05W0)&qG&
z^L!P(>;7A`Ja9Ciwx1#bdfZ^UAcek?AFnKP+)*Cicn;K#P(#6raQNW8TtZY;Koo2?
zCf=G?L`<4##YcKG!F^*9A?<yPgEKzH!v`|BqI?bZxO1bSdjAnx_2^;4C;Jf=nWn+$
zNyRYqrVbw648Z%#L-1XBCZS53NCah8<EkxPgki!w5Cl3w^hebI#>LUw6iIk5KG{&9
z9Kt+5w4p&WAQjxK@WCs3$}!G8g$LUruwI;?>~UXDM<1>y<OBx<-#g9BY!||PttB`m
zR6;NLhKmIg?I23}HeMJ1i1=NdLeJ6HVxP_~dhxg<9Fsl_?3Xs;kqfrsn?sg>menPA
XLV+DO&v4+QA&vCHxJn{tQ;_m+4Ikae

diff --git a/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt b/examples/fluids/tests-output/createPyTorchModel/NNModel_HIT_fp64_jit.pt
deleted file mode 100644
index ef631350158acef30b674f448471c73763081eb3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9977
zcmd5>2{@G9+aFsPBw5Q+NY)71LZ_5{4Mmp5#AF}FzD6XJ?27D5NTos&a%7F{Qe>+n
zsU(%HUiqHU+xt@S|9^e&dtKi%*SVf&W}fpq_xYXs{+;`rqo+wmjzUpUp?<p9QIseh
z9c?EooC8Kp-4Ju!S$Y@7&el!L3hRay+o_^NhB7fR>AOYP7R6HsH>L_D4!UK5I_ipv
z8NU1Ag=qMuce~EPDj)sADUv3ztABQ?uzU<)dv+|4W#57(K}y<(ZJNP;CcB9Q-v+>T
z%Om8&%zgN5zGt3|G#r{Jn`+aT&A`)Te&;gd>VfYOjU<7VHgIT&Gc1+02N3d7*MD`d
zh2;8|=F7=^;i0(nb)DU_u<<z~$Dz_b$mWw`MU~kI^!Jlq@T0B(7fgqu$))eX?fhrP
z7mkDhvj<-lW?G*@u1zAL>eNrbg+21R<lVOb)#!<@;^r6O=9^`wX|{F20PSm{I#ibc
zM|2Z;ntCX3I~e)&Iz>GU&}c$EQ11mP3O7e8Vjlzk+A3E}TLa`Q>=W>Ix(-8+WgqH3
z^c9Y<uqNt+JqB6)h2wtPufe@eoc3Er9s<@!B-_LCtze@5^~&D*N{Fkc)K0!s4EkwC
zdrj*yL3^;J8BJXYd{r8uJtck;I6KcK7MWfL=8ZjZU2haY@VeZ`;dYGG3Nc5w%=_O0
z`k@lW(skEB?_`p-ZRmaYzWHgL<iSR;HC@I^N~HnT3q^Gba?|Xs=nP`@42EEX+$G!x
zrdr^@x$cTZR|iZLdU0naq8c3Z)1Hvyu7w^g?`W?Xb^<KCVawhb20Sg|@Sc5baBm{-
z^cDJH`22{gk|O>#JpK6=rgBpo^xkrKq~TLDtgE36Kjhj0?y_x?O6qt3EUjZY=+-|3
zf)e#~)QJs1gGSK=@0kU%A=!G@XsW$emC@$o7Cqp(>5dx>!nGjPl|7Baw*{ULu}B@1
zi~|F@F=1)7rGVvD%(YQdhH7_~hcuR7T0zLL8pbfO8lo>#*K(dJgN!*ho_F4P0KBg}
zHz|qBh9kBo3=7}PgL4@h4`$qZ2zH002014;19D6LGUc!ufa0x_nQQ9;T#=ca%#@d*
z%j3=?`&^l;PqgkY*UyB&)8*>R*jK$km%?^$MP)k>yjvN=t9S*RtkIF+;7)*o=ihOT
ziS)u_!He2OcTT_!mwP|^2=s%ANu!9?o)mcLwRkTZV+sJDNw6bA$v`?%h|NpzCHRB|
ztO`e}LC-U`jpcPQ;GNJM%Qfaa&`;lCL-r;Sx-Hz%GXL}d2xgrXF|8)u>!R2kw|P$i
zC<w${k{d696lxP^{axz-C4+Bh{>^51mU@HD3}pwf(+k9A#Pz_!0PVc3&tpNw+sw-e
z7KO00xy^Rt(MKSUvbLNaRR{F~^`yR%bpiDH6#NJ6W}v4@Np;;?D6E<cg(4QcHS|Dc
z4LuN7QIaI;!T<KU=u4eBXF3c>WKH-!wLStN59ZAxIyyk%0yPbXTpBp5DWte3VGyo&
z&nTg941?>B_w#V{4Z=8yTbcqPr9f(rxRhV}0BCccqvuU*2Jd_ZeQbs9LjSbmA=2A=
z;8Fh=wwQ<~a9A~vuC6K{%8r(sjyk1*LPD>A-r07TH$SuMs9hJ#>!55`kROEW^0!w8
zY7GH@o>5Yl$3u|F&%i;&#0*3qiDfN*Sl8f3LPd%3SNu%XxT}H<cfq@tdN&>qv_aLH
z$Nkg7_n~?|Q+JJ87W_h|rni`W72Gu4aJ;3d4hkj<%PBH602IkOCiC`TXei<u{(7bf
zJSdCx9l&HmZiUdAHdz8lZp`^`!{8C9oPVIgT>KQC!Mq;KV7>!JKYoCN{DY9Wr*=E(
z^+tI2h*UK5u@;a>(_Ea>c^lMU>6;_@PzY_mC@fYm6##j87tVI74se5ecdyh~5n#{D
z_psd22t5Z?-L?tdgrDpwiu^J8pt!pB{6n7x7*z4uy{W1SU`_dVsC%Zu?K1eFy1;r+
zEE6#_J$Dya+0eT07>a_AZti^AalQmvoKdG`nd*Vv%#V15tZP8Y-Lpo_x!rIR?O^V$
z<WPX0sf(+@-GUohIu=Z<Z-QNFdea=@5m0-QS%Rzd3f$#kRNGE|2kz~^L!s7P30ZL1
zc!EwN#PG(-Xb&YogU1DX-;mvgyC08OY3F0XmchN^i>CMBpq1IF>$;Diqj`ATC!<QR
zBfH;MrT7-q;bv{BF^T{}k}02#c$9<d0hwW+Oj=>WA!nvgp&G!@5Tc1rZ-<JZ`{~=}
z>VWwZOyS~m5vW=4&t$rp3*O|VEZE++fwy%9KJcSkfz13kjipouyufA@q!ri#G@K|<
ze&xx4C78F;zPBEbQqmkeiRfo{Ila3iR||{`=ZddAPy`#_#MQIfl!MEvY6=StSs-vd
zCCi{-EeM?Q4wbl_1(W7gcFDLGf}vx>*A<IvAYtkiclm}UFhL@=S?O~zWSE_1sXkf*
z#l%VB%Xe3y=eeokfc|<|?KM6*Y;zxA3M!M@42qz2!8DuEY$teRO=D*fSPH8m@0juC
zw*l<=Cqs@>HIOqVLMhkk4kSCbvsEnsg3!b9R*sz+@T5?|iw}c$AtmRc^5m;aV3X++
z>G-Z@z-yK}RpU|#He``CQG_?a$k_<BVTC#vswk6s&B_lLNWO+f*0(|Fy`usVvK`QF
z{8HuEhE_PYuT*22wgxVkpE~_6tQzohURAC+(gE3hl8po`?}M|fk{%3giNLUBOgDx0
zAvk<K>-Mx)9Xvr{u9|+N6uQ436vg+nLyGv}P}wh?aFhZcik_<lU2I~e%DN3;OK(wb
zrD-F;-Y3sj=G+C(FuWEmVDJZ?+QlkLI;}8>Vs3<2xd`3}Yd$4T83Efqhl}uR>V`~M
z)Vpn7t?+IsuQAzTHBh*TGPV=G1~#xZEu^#FThfOaYG(QWtA0xUrhXn6aVq5Fr~;?D
zMc+O&cmR)-T;HZ=UJIIwi%URO9mvTV4~}#?2P-ZpTc<^KE$O`<^>+20h?Wt{>UzeB
zqF7zeM4jy&RFp_iCV`}u7>t{fi=_<)BZ~R{=!nHzJK!)<Xe{2=)yY8B08Ky%p7a+m
zCgz!LE?8T<E#8_y5@yVTe8<?jVvf5zIAD%pE$uzAE>;*DCntLXsg5x+AEEF#Hv-wR
zHkR5F#p6XCovhp)aIT`3PIxy25a;q!^SjUnID$J4?`Dg2KoiJ`2H22<*&x6q2=Gn<
z#agfq2qgArOmVc<(jRETx}^@SLy$aiw$?Uo1WL<qwqe}eY#m(D7#G|zcUuQ5j2jN`
z>g0m)kU$ft1O$vJks7YZn^<ccf%;n&^Z_ULV-7fjZ@;1mXd8;9zFRuG6KFKO+zt|G
zHBkbZOq%Q51T@!alDLIgS~@v6;4Iy2o$#(`T^B2y3(o3*t)&~9Kqsj0rcYRJjAZqw
zW42gV0=*5=stvUbda2jqgbg+{noI~J!xAI{h``x=tF`ePJ?_T{j5ZtAA_t4K?13e=
z{uAjEXe}Jp-di81bqGyh`kQkElzGVl2?EPn(5vt+`4{i3OPG}3du79d^oK|h8?is@
zkggGl`nyk?Y*rE=N!W~3Scg<#U#hTz0FJ*1;6&`@Ldq6VVg?`;qjvH+DGEg_#E4-+
zR!l!HnPgG2zm6D7WDzCyJvK1N)1MLZJxoNMy;YQU{VF~9kLi@Tt~|bWC+(msI#N2?
z&W@FA{K@54Px1Tq-Iq?FifM|E?cjj1dSu$$W1}Ybx7}-?I`1Dp<@k08*L3nTmNL-m
zmYtDC4g<q_=RfRT#iv#T$C|XK>F%uFwo#}_)lH5(<*ncR&GM&5Ug*zkdHf>8QTFh$
z9*Ik&yxTkHBWz5hT8=x09+0HTJEm*rY<cax?#cIk5uZ}S<8Dj2a#QM*<hsWMeVP0$
z;4j`^`z4FKjPAWUhqAAi&w_nu>E&6@jL%73(rSV@+^gn+<Q-F`p(kc(h-@bg4tjMU
z?yH4_>FWBVgxdStY>QeUv1yH25;29};{E%;1d<xFA*QP6>TB;a#2Bd?s$mQa_0@G$
zF^2n;w3G=bW8_~Hfh2@Ls)_njAdp40D8|{=(%u1w{w5K&csB=IS0sAL1k43+Zu*)i
zO;jj>eCdWfg%Bu0m;CVN)Llh81dmu&_d^2|^MA$6l3a=5@nU~Usn}mq`jd<9T5?f_
zpQ%cXX|+TAFL2Tq-DyCYs@|1K+Rx}lJL4*(oPW9NV!4a8yy@(W>p%!T$6@Er-befa
z=ifF&&%Up_&Z*g9RpeXoj&^LC8TI@m$AsLCk73R<XK`l=%S#s+(2*rJeS7JI?X}Z$
z6iZXZ%d(Eo>fq0r+Iz+Tj-XPd<DL9zZrhHLO+U*w(>l{6;lmK@-pX?D(vk3d|AH3b
z?2slw4*i*J?6t-P-lP{&Mlm{eK_@ild7Uo3)T)S5O@n6oMvEez9-^MDJ4zl+x5s-+
zJ1!&_-rV@y{$sCKcH6M*Ig>z3x^|JFa*o2us`KtV#`Re;9ABtJ2jxO~)lVE7O-vvY
zZn(<UD^f&z#f+}i%J{@E(^xA3J#U+n9l3vO+vdVOBt&=i7R~V6g1B<@pW^ZB?G|$s
z%fB&b%RPPL&%=kz-~a#oR}aPZdsKXzJAQIr@g?VF`bF<eOthogiT>L#>5q;vAWgzP
z-aAXqUh($uX}yhd;_h#G<V&K!qPnVX?4E__DyJeQQoqfO+a@lgdp;O+Cf!>Z)6o#a
zx3y-IXlrU~QA2L%@m);%3p)nN>=)>9mtA&N(^ov@6HE$7&5kBC>`-UFdwV{DI^WZt
zkTdSXpG27cdU0m>woN6kZ<9sUp5k>Ae3Fh9S3<40eUc6zI8@-G<H)C`SNcMK?trw*
zS)8Adlymw!5{0z3-Ckd0pJY0mnUs7XBQ&e5@SM{nwOL8_-Sdo#`RV<t0q_0g9;JRo
z6oQh<4%4}708tC)KehDNGnWa9;~!^yx#zL{l6#_t-i>c}>fXuJn2LW>8j&s&$?*Uy
z-ruXiptM=fLW0DQ=NXLz{<-u}imZp31_fz}=GTe4IVP8E$5hTZj4F=#S52z^0rzj=
zJ~iae6_UsB#e>D)0+Zd@mp3c2+kb&zqZbv#zA5VKX}Y}KjB^8R*HUt2^>)8RUyyFt
z)ZXBiHDPEGvnWEi>HMT&yo&KstwyB%{(_I$7J-`MbtmqP(ClI6!$&?-XiU;Mj;7Uk
z8PuO){AvCAG)3<zwxG5<+J4=c7bXOZ-)T~`)9)h{PDm?Gh^G5|-Jd(SoQ0O{Ow(1p
zs=l0n{(}r}1_(A9-Ep;fGCoDhBc?nWiiD@kQlqWTb1Y{u98CLj!Y?+FlbYO1ohu5n
z)6ZwjbPlvuCRLGU-B6Pt_4?Ji%bKkATC--mTT(^|1{3=h$vawwO%LCb>3>+-Fu8@k
zaK5T9bN-2jp9?L=CGQ%SQVpYO=ZUT@_Q5l6FharMAI|B<O=DjV92M=KZI<yhyyX{=
zB;Pi`CxF)>?|2r-EKwDR-Q1p|k=T+Nd+B5A-J#huXWk&Hf|iBj!TcQFwiDc3<I!|Q
zeOFCepYZitg}&XGp7FUt@tE+(z>N30w|R~-+liSgUv8-tDkCL2UGufdzD~s9UQnzx
zO&Owg{GQYOoqhfk>Jm$#F1V_@eV?}fx|>I7>(uR}4ViUCi<#PnS$G%*G14k2-Pp|i
zz%f<l;ls-*<aHCa5`R!t-oyn=_HilHQ>o_8kAz2wweDvt9xLr9*;<%W;(Y8}m@{SG
z<0livw*pMmo5JI$cfZ>Ip2CH}afZux^GnYzhIa0j{yzxXqEzz+BxgOWXryhh7w->w
z^anAy$?{&jcrNV7>w}#g(F5$vl)=NjN)p};!SnN2*hd<z<Be7K>b@tbN&BRxXThlN
zP+il-?wLAVl$ej<%$@n3ch8=1;1`umdzuBWm}z;W6Gy;BR*k0{347Hw!~Q6W`(GJ>
z<)P2H_E>z6|89>rr4D~oIiE6>6FF}GSpQV?I;KP|RD#A_kb(?>{kjf^e49oHd6?!`
z+&0l(K||#f51X9(hyVC6H2d*{?C_gcg(I?J!*mPX3}3yZea?Z1)7DOyXPx84GGyZ=
z(c<z&r>hd_4)EkYRcd8UR;eE0*~(!hgc1*LwBmUfPkPbp_*Qe9%%E3#-CAUA4KrIe
zs`tMMj<EGfjI~h7-9<KPy;Z$>n=WbV0j07Fq9&??GgHBhv$A}}6o=!-3a@56CG&+T
zC%U@srETQ9DRMsNvAp=!?eA0aI}+$VWp6$+di2rrS25Qp+R73~nC~AxG-^ya?kN)1
z`82U!&~sG!vk9H@){WZ6N(mHlCwh-esxwk7?z1|UbuPgqVJ!F_&0fH`-+@Qlt4)bf
z?_k#Wm-_oc^E^2mUR!ndyM~QDJk^nuV7HiE63oJ`H5+6bc#CJ>k%<%6?Q<R&RbCAe
z_KYdef}UNAS0<Yk;~lQnnfu-=+sdu;UdY_;^aJHFcE*W5Vdms;m(e|F8GcJw-y(-$
z10$}C{2G{FnL9eo;uh5PHfhF37I&{jt4r2ep-!YT1@|OD#w6XukUo<vRcS~>=F1-u
zS4kdOa7sUA84!JZ{LZ^-^D;{FGJ4K8UQg3yXuN55f4zR3<V{Eb^iUq|5Ab6T8NRG+
zA5(tIMmc9RwdM;Q(Pxi0`1w&HIZ`pQrd~a{)kDeup3kn597rxHhRoVt`j=e%^q?K}
zfT$nJvmcoc8(lNx*-)WT%Np2wgPo`!(ieAalDqtJwkezJssm;VcxNR@?j;kc{)8S(
zp09XD9_YoJd5%|e?<q;Mi}K01bK3N+HUc?w=4FJ(pIMvrM=b)yJnm`gODw`jrG`zn
zw>f?qdMUL%W9V9aOzxXhtA?I0_OAnmvZf;6@&$W;Y1v$Ip{3;HHLfn2&Ictt1sfAw
z?Sxf=is~=aP!`3Qii}(-IQ^xBKG3R9#)ARFCFb$|lSj_}eP>QezVH<qQ&#Zfa=E5K
zmL6vuaGAa&fd8}QRWbRB8k$CPv(54gOUXp^L-Lp7h^j=csVXKY`hR?cmMaR)^0E4^
zAA^Oz$RAE#K2XDIHBjeRDMxr)7LO+H7M;I68*7)4o60U&lrWnzJ72YB3YWZRAC>>y
zZnTcl%)+ie>Z(GryT!H~@%k|Hu_J?vLPlc8XG-amSB6hr<!`%Ow7`sq37hp&<tcD}
z3@<e}sXTL~dcMY%ozGW?X}z#qA#Hfu`UMiv97bQcDA74Xrm{V!i)r(ArwQ&9NTS9s
z_;vOV2TZ-~F`(K#COxB-d{ipi;*J<|r2>ljs-92VVI}ETi!VcH3ru!e2yE{*^UmGB
zD3#6Al{C^3U(Qe2?lf%`r{Z~l$>({suuyZwv-gTDP)^{q8lzO4IC)8sJfW0ISU<xu
z^G1}lH|<jXz|8#mJ<GEE;h;H^Kd^O76Y~d2(}&VX{y>MHOY6(+L-GedawLCfk@%KB
zbfhEs!xfBBNce{c)sbmz!N5_m{@Dh}3PUL*fADP^;1?*+o$J7+k4ls;<qv0)T2f*|
zKDGo8MJ?qILantv!ThrJd<mf?YRL}KI(PAp>@PUPI2P?!2r8KL2;b4l8x#ZwW8Ipc
z7A0KGihoHM$B+@FfPqEwRoAiv;@3+i9u(JVj)r%2!{Xgszim@)UhXhIL~tOf;L^(4
zQH<J%xMfBxtM55*pd^0Xq6f~!71<3_Q4(FFtsk0R`YMSg_O~<5`VgXgC}PpmBq84Z
zI+I_1I>%1@W5u<`3H^^i%a5^E0&PHgy*iNik3h@!yH^73L=wK$fh2weTD}pz5=f7F
zEg;DsftK&jajxJ8g<b=Q=pNso{$3U>-+Yo?0VP4V3e<nGz4xQR%QteAR<x3_{`Xt?
zTP1(&)$%<EgB9%|r^3G;vLAZ&zuYWZtHb6i3~*xjR}5I4ixo@a6{h%X{7q9<$t{Q*
z4?m8?^0kr4iq_vU{`0M`&cgB)g5e4~u*jjzFN63$dBlHMm;K1Z@_g^;3JXS9{~Zff
zXJbWXv%;1On}5@mReXY&Z~n-{^674+Psnln^R55L!rxQP@=1_q1q)eRtJ<+_s{JwO
zmrrX*N0wboTUNpQb0XE#T(?f-&-b~=cKlTn(Tb((`<;S#nuc7l2p;i5>Mi|pVf(l5
VNDhJgMnOEKLopy9;``s;{T~`>-Le1x

diff --git a/examples/fluids/tests-output/createPyTorchModel/README.md b/examples/fluids/tests-output/createPyTorchModel/README.md
deleted file mode 100644
index cc34055a4b..0000000000
--- a/examples/fluids/tests-output/createPyTorchModel/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This directory exists to create a PyTorch model with certain weights and biases. It is mostly setup for testing purposes.
diff --git a/examples/fluids/tests-output/createPyTorchModel/update_weights.py b/examples/fluids/tests-output/createPyTorchModel/update_weights.py
deleted file mode 100755
index a30304d647..0000000000
--- a/examples/fluids/tests-output/createPyTorchModel/update_weights.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-import torch
-import torch.nn as nn
-from pathlib import Path
-import numpy as np
-
-
-new_parameters_Path = Path('../../dd_sgs_data')
-
-weights = []
-biases = []
-weights.append(np.loadtxt(new_parameters_Path / 'w1.dat', skiprows=1).reshape(6, 20).T)
-weights.append(np.loadtxt(new_parameters_Path / 'w2.dat', skiprows=1).reshape(20, 6).T)
-biases.append(np.loadtxt(new_parameters_Path / 'b1.dat', skiprows=1))
-biases.append(np.loadtxt(new_parameters_Path / 'b2.dat', skiprows=1))
-
-# Anisotropic SGS model for LES developed by Aviral Prakash and John A. Evans at UCB
-
-
-class anisoSGS(nn.Module):
-    # The class takes as inputs the input and output dimensions and the number of layers
-    def __init__(self, inputDim=6, outputDim=6, numNeurons=20, numLayers=1):
-        super().__init__()
-        self.ndIn = inputDim
-        self.ndOut = outputDim
-        self.nNeurons = numNeurons
-        self.nLayers = numLayers
-        self.net = nn.Sequential(
-            nn.Linear(self.ndIn, self.nNeurons),
-            nn.LeakyReLU(0.3),
-            nn.Linear(self.nNeurons, self.ndOut))
-
-    # Define the method to do a forward pass
-    def forward(self, x):
-        return self.net(x)
-
-
-def load_n_trace_model(model_name):
-    # Instantiate PT model and load state dict
-    model = anisoSGS()
-    model.load_state_dict(torch.load(f'{model_name}.pt', map_location=torch.device('cpu')))
-    model.double()
-
-    # Change individual model weights
-    with torch.no_grad():
-        for i, layer in enumerate([0, 2]):
-            m, n = model.net[layer].weight.shape
-            print('weight shape', m, n)
-
-            model.net[layer].weight[...] = torch.from_numpy(weights[i])[...]
-            model.net[layer].bias[...] = torch.from_numpy(biases[i])[...]
-
-    # Prepare model for inference
-    dummy_input = torch.randn(512, 6, dtype=torch.float64, device='cpu')
-    with torch.no_grad():
-        # model_script = torch.jit.script(model)
-        # torch.jit.save(model_script, f"{model_name}_fp64_jit.ptc")
-
-        model = torch.jit.trace(model, dummy_input)
-        torch.jit.save(model, f"{model_name}_fp64_jit.pt")
-
-    return model
-
-
-def main():
-    model_name = 'NNModel_HIT'
-    model = load_n_trace_model(model_name)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/junit.py b/tests/junit.py
index 27d18220ea..923dbee50c 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -27,8 +27,6 @@ def create_argparser() -> argparse.ArgumentParser:
     parser.add_argument('-o', '--output', type=Optional[Path], default=None, help='Output file to write test')
     parser.add_argument('-b', '--junit-batch', type=str, default='', help='Name of JUnit batch for output file')
     parser.add_argument('-np', '--pool-size', type=int, default=1, help='Number of test cases to run in parallel')
-    parser.add_argument('-s', '--smartredis_dir', type=str, default='', help='path to SmartSim library, if present')
-    parser.add_argument('--has_torch', type=bool, default=False, help='Whether to build with torch')
     parser.add_argument('test', help='Test executable', nargs='?')
 
     return parser
@@ -36,8 +34,8 @@ def create_argparser() -> argparse.ArgumentParser:
 
 # Necessary functions for running tests
 class CeedSuiteSpec(SuiteSpec):
-    def __init__(self, has_torch: bool):
-        self.has_torch: bool = has_torch
+    def __init__(self):
+        pass
 
     def get_source_path(self, test: str) -> Path:
         """Compute path to test source file
@@ -114,8 +112,6 @@ def check_pre_skip(self, test: str, spec: TestSpec, resource: str, nproc: int) -
         for condition in spec.only:
             if (condition == 'cpu') and ('gpu' in resource):
                 return 'CPU only test with GPU backend'
-            if condition == 'torch' and not self.has_torch:
-                return 'PyTorch only test without USE_TORCH=1'
 
     def check_post_skip(self, test: str, spec: TestSpec, resource: str, stderr: str) -> Optional[str]:
         """Check if a test case should be allowed to fail, based on its stderr output
@@ -199,43 +195,13 @@ def check_allowed_stdout(self, test: str) -> bool:
 if __name__ == '__main__':
     args = create_argparser().parse_args()
 
-    # run tests
-    if 'smartsim' in args.test:
-        has_smartsim: bool = args.smartredis_dir and Path(args.smartredis_dir).is_dir()
-        test_cases = []
-
-        if args.mode is RunMode.TAP:
-            print(f'1..1')
-        if has_smartsim:
-            sys.path.insert(0, str(Path(__file__).parents[1] / "examples" / "fluids"))
-            from smartsim_regression_framework import SmartSimTest
-
-            test_framework = SmartSimTest(Path(__file__).parent / 'test_dir')
-            test_framework.setup()
-
-            is_new_subtest = True
-            subtest_ok = True
-            for i, backend in enumerate(args.ceed_backends):
-                test_cases.append(test_framework.test_junit(backend))
-                if is_new_subtest and args.mode == RunMode.TAP:
-                    is_new_subtest = False
-                    print(f'# Subtest: {test_cases[0].category}')
-                    print(f'    1..{len(args.ceed_backends)}')
-                print(test_case_output_string(test_cases[i], TestSpec("SmartSim Tests"), args.mode, backend, '', i))
-            if args.mode == RunMode.TAP:
-                print(f'{"" if subtest_ok else "not "}ok 1 - {test_cases[0].category}')
-            test_framework.teardown()
-        elif args.mode is RunMode.TAP:
-            print(f'ok 1 - # SKIP SmartSim not installed')
-        result: TestSuite = TestSuite('SmartSim Tests', test_cases)
-    else:
-        result: TestSuite = run_tests(
-            args.test,
-            args.ceed_backends,
-            args.mode,
-            args.nproc,
-            CeedSuiteSpec(args.has_torch),
-            args.pool_size)
+    result: TestSuite = run_tests(
+        args.test,
+        args.ceed_backends,
+        args.mode,
+        args.nproc,
+        CeedSuiteSpec(),
+        args.pool_size)
 
     # write output and check for failures
     if args.mode is RunMode.JUNIT:

From 958e607d5f3dec692b0a7e530b6ada4186151869 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Jun 2024 10:41:17 -0600
Subject: [PATCH 102/571] ref - drop unused variables in OpAtPoints

---
 backends/ref/ceed-ref-operator.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 538accec17..3d65e1af05 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1199,24 +1199,16 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
     e_vec_size = (is_active_at_points ? num_points : elem_size_active) * num_comp_active;
     for (CeedInt s = 0; s < e_vec_size; s++) {
       for (CeedInt i = 0; i < num_input_fields; i++) {
-        bool                is_active_input = false;
-        CeedInt             size;
-        CeedRestrictionType rstr_type;
-        CeedEvalMode        eval_mode;
-        CeedVector          vec;
-        CeedElemRestriction elem_rstr;
-        CeedBasis           basis;
+        bool         is_active_input = false;
+        CeedEvalMode eval_mode;
+        CeedVector   vec;
+        CeedBasis    basis;
 
         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
         // Skip non-active input
         is_active_input = vec == CEED_VECTOR_ACTIVE;
         if (!is_active_input) continue;
 
-        // Get elem_size, eval_mode, size
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
         // Update unit vector
         {
           CeedScalar *array;
@@ -1228,6 +1220,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
         }
         // Basis action
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
         switch (eval_mode) {
           case CEED_EVAL_NONE:
             break;
@@ -1268,10 +1261,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
         is_active_output = vec == CEED_VECTOR_ACTIVE;
         if (!is_active_output) continue;
 
-        // ---- Get elem_size, eval_mode, size
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
         // ---- Basis action
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
         switch (eval_mode) {
           case CEED_EVAL_NONE:
             break;  // No action
@@ -1302,6 +1293,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
         }
         // ---- Restrict output block
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
         if (rstr_type == CEED_RESTRICTION_POINTS) {
           CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));

From ecc12a5a87562b9b60760c6565c9215dbc60efc9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Jun 2024 12:30:37 -0600
Subject: [PATCH 103/571] ci - drop extra CPU tests in CUDA job

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6b4bfdc3c9..00dca5b235 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -196,7 +196,7 @@ noether-cuda:
 # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="cuda" junit BACKENDS="$BACKENDS_GPU" search="petsc fluids-navierstokes solids"
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search="petsc fluids-navierstokes solids"
 # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code

From 3bcf805f51eb7499f089092c0b427c1730cc2165 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Jun 2024 14:31:43 -0600
Subject: [PATCH 104/571] ci - make clean

---
 .gitlab-ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 00dca5b235..be6b4db93d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -34,6 +34,7 @@ noether-asan:
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
+    - make clean
     - make -j$NPROC_CPU
 # -- libCEED only tests
     - echo "-------------- core tests ----------"
@@ -85,6 +86,7 @@ noether-cpu:
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
+    - make clean
     - OCCA_DIR= PEDANTIC=1 make -j$NPROC_CPU
     - make -j$NPROC_CPU
 # -- libCEED only tests
@@ -156,6 +158,7 @@ noether-sycl:
     - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL
+    - make clean
     - make -j$NPROC_CPU
 # Report status
     - touch .SUCCESS
@@ -186,6 +189,7 @@ noether-cuda:
     - echo "-------------- libCEED -------------" && make info
     - BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+    - make clean
     - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
 # -- libCEED only tests
     - echo "-------------- core tests ----------"
@@ -248,6 +252,7 @@ noether-rocm:
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+    - make clean
     - make -j$NPROC_CPU
 # -- libCEED only tests
     - echo "-------------- core tests ----------"
@@ -333,6 +338,7 @@ noether-float:
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+    - make clean
     - make -j$NPROC_CPU
 # -- libCEED only tests
     - echo "-------------- core tests ----------"

From 7b64fac93034e90dc20136499b3f1bb9fd8a78e0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Jun 2024 14:48:48 -0600
Subject: [PATCH 105/571] make - remove stale ref to tmpl backends, fix bennch
 target

---
 Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 4d0f150e78..cb25f658b5 100644
--- a/Makefile
+++ b/Makefile
@@ -232,7 +232,6 @@ libceed.c += $(gallery.c)
 libceeds = $(libceed)
 BACKENDS_BUILTIN := /cpu/self/ref/serial /cpu/self/ref/blocked /cpu/self/opt/serial /cpu/self/opt/blocked
 BACKENDS_MAKE := $(BACKENDS_BUILTIN)
-TEST_BACKENDS := /cpu/self/tmpl /cpu/self/tmpl/sub
 
 # Tests
 tests.c   := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c))
@@ -361,7 +360,7 @@ info-backends:
 	$(info make: 'lib' with optional backends: $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS)))
 	@true
 info-backends-all:
-	$(info make: 'lib' with backends: $(filter-out $(TEST_BACKENDS),$(BACKENDS)))
+	$(info make: 'lib' with backends: $(BACKENDS))
 	@true
 
 $(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(libceed.so)))
@@ -703,7 +702,7 @@ allbenchmarks = petsc-bps
 bench_targets = $(addprefix bench-,$(allbenchmarks))
 .PHONY: $(bench_targets) benchmarks
 $(bench_targets): bench-%: $(OBJDIR)/%
-	cd benchmarks && ./benchmark.sh --ceed "$(BACKENDS_MAKE)" -r $(*).sh
+	cd benchmarks && ./benchmark.sh --ceed "$(BACKENDS)" -r $(*).sh
 benchmarks: $(bench_targets)
 
 $(ceed.pc) : pkgconfig-prefix = $(abspath .)

From 34d146140c2fce42d8bc042f039d47d4ff020864 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 30 May 2024 15:48:38 -0600
Subject: [PATCH 106/571] cuda - impl BasisApplyAtPoints

---
 backends/cuda-ref/ceed-cuda-ref-basis.c       | 121 ++++++
 backends/cuda-ref/ceed-cuda-ref.h             |   5 +
 .../cuda/cuda-ref-basis-tensor-at-points.h    | 355 ++++++++++++++++++
 3 files changed, 481 insertions(+)
 create mode 100644 include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index e4ec48105d..e1cf00f2eb 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -83,6 +83,124 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed              ceed;
+  CeedInt           Q_1d, dim, max_num_points = num_points[0];
+  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
+  const int         max_block_size = 32;
+  const CeedScalar *d_x, *d_u;
+  CeedScalar       *d_v;
+  CeedBasis_Cuda   *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Check uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) {
+    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
+              "BasisApplyAtPoints only supported for the same number of points in each element");
+  }
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCall(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    char       *basis_kernel_source;
+    const char *basis_kernel_path;
+    CeedInt     num_comp;
+
+    if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
+                                     num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
+                                     max_num_points, "POINTS_BUFF_LEN",
+                                     max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedFree(&basis_kernel_path));
+    CeedCallBackend(CeedFree(&basis_kernel_source));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+
+  // Clear v for transpose operation
+  if (is_transpose) {
+    CeedSize length;
+
+    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+    } break;
+    case CEED_EVAL_GRAD: {
+      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = max_block_size;
+
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
@@ -184,9 +302,11 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
+  if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
+  CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -255,6 +375,7 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 0f4ac3f583..312af8ccf5 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -67,9 +67,14 @@ typedef struct {
   CUfunction  Interp;
   CUfunction  Grad;
   CUfunction  Weight;
+  CUmodule    moduleAtPoints;
+  CeedInt     num_points;
+  CUfunction  InterpAtPoints;
+  CUfunction  GradAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
+  CeedScalar *d_chebyshev_interp_1d;
 } CeedBasis_Cuda;
 
 typedef struct {
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
new file mode 100644
index 0000000000..2784c1a4fc
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -0,0 +1,355 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA tensor product basis with AtPoints evaluation
+
+#include <ceed.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[0]  = chebyshev_x[1];
+    chebyshev_x[1]  = chebyshev_x[2];
+    chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
+    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                          const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+
+  // Apply basis element by element
+  if (is_transpose) {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedInt           pre   = 1;
+        CeedInt           post  = 1;
+
+        // Clear Chebyshev coeffs
+        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+          s_chebyshev_coeffs[k] = 0.0;
+        }
+
+        // Map from point
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          pre  = 1;
+          post = 1;
+          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (d == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+                }
+              }
+            }
+            post *= Q;
+          }
+        }
+
+        // Map from coefficients
+        pre  = BASIS_NUM_QPTS;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= Q;
+          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * P;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % P;
+            const CeedInt a   = k / (post * P);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= P;
+        }
+      }
+    }
+  } else {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedInt           pre   = u_size;
+        CeedInt           post  = 1;
+
+        // Map to coefficients
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= P;
+          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * Q;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % Q;
+            const CeedInt a   = k / (post * Q);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= Q;
+        }
+
+        // Map to point
+        __syncthreads();
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          pre  = BASIS_NUM_QPTS;
+          post = 1;
+          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+            // Update buffers used
+            pre /= Q;
+            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                CeedScalar v_k = 0;
+
+                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+                out[a * post + c] = v_k;
+              }
+            }
+            post *= 1;
+          }
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
+  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  if (is_transpose) {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedInt     pre   = 1;
+        CeedInt     post  = 1;
+
+        // Clear Chebyshev coeffs
+        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+          s_chebyshev_coeffs[k] = 0.0;
+        }
+
+        // Map from point
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+            const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
+
+            pre  = 1;
+            post = 1;
+            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+              // Update buffers used
+              pre /= 1;
+              const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
+              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+              // Build Chebyshev polynomial values
+              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+              // Contract along middle index
+              for (CeedInt a = 0; a < pre; a++) {
+                for (CeedInt c = 0; c < post; c++) {
+                  if (dim_2 == BASIS_DIM - 1) {
+                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                  } else {
+                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+                  }
+                }
+              }
+              post *= Q;
+            }
+          }
+        }
+
+        // Map from coefficients
+        pre  = BASIS_NUM_QPTS;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= Q;
+          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * P;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % P;
+            const CeedInt a   = k / (post * P);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= P;
+        }
+      }
+    }
+  } else {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        CeedInt           pre   = u_size;
+        CeedInt           post  = 1;
+
+        // Map to coefficients
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= P;
+          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * Q;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % Q;
+            const CeedInt a   = k / (post * Q);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= Q;
+        }
+
+        // Map to point
+        __syncthreads();
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+            CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+
+            pre  = BASIS_NUM_QPTS;
+            post = 1;
+            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+              // Update buffers used
+              pre /= Q;
+              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+              // Build Chebyshev polynomial values
+              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+
+              // Contract along middle index
+              for (CeedInt a = 0; a < pre; a++) {
+                for (CeedInt c = 0; c < post; c++) {
+                  CeedScalar v_k = 0;
+
+                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+                  out[a * post + c] = v_k;
+                }
+              }
+              post *= 1;
+            }
+          }
+        }
+      }
+    }
+  }
+}

From 1c21e86942aba8686e330a1715ed048a6983247b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Jun 2024 16:00:58 -0600
Subject: [PATCH 107/571] hip - add BasisApplyAtPoints

---
 backends/hip-ref/ceed-hip-ref-basis.c         | 121 ++++++
 backends/hip-ref/ceed-hip-ref.h               |   5 +
 .../hip/hip-ref-basis-tensor-at-points.h      | 355 ++++++++++++++++++
 3 files changed, 481 insertions(+)
 create mode 100644 include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h

diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 243e801e52..06e599d847 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -82,6 +82,124 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                               CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed              ceed;
+  CeedInt           Q_1d, dim, max_num_points = num_points[0];
+  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
+  const int         max_block_size = 32;
+  const CeedScalar *d_x, *d_u;
+  CeedScalar       *d_v;
+  CeedBasis_Hip    *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Check uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) {
+    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
+              "BasisApplyAtPoints only supported for the same number of points in each element");
+  }
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCall(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    char       *basis_kernel_source;
+    const char *basis_kernel_path;
+    CeedInt     num_comp;
+
+    if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h", &basis_kernel_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
+                                    num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
+                                    max_num_points, "POINTS_BUFF_LEN",
+                                    max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedFree(&basis_kernel_path));
+    CeedCallBackend(CeedFree(&basis_kernel_source));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+
+  // Clear v for transpose operation
+  if (is_transpose) {
+    CeedSize length;
+
+    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+    } break;
+    case CEED_EVAL_GRAD: {
+      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = max_block_size;
+
+      CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
@@ -183,9 +301,11 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
+  if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
+  CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -254,6 +374,7 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 392f9ddb79..b73b72d07c 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -71,9 +71,14 @@ typedef struct {
   hipFunction_t Interp;
   hipFunction_t Grad;
   hipFunction_t Weight;
+  hipModule_t   moduleAtPoints;
+  CeedInt       num_points;
+  hipFunction_t InterpAtPoints;
+  hipFunction_t GradAtPoints;
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_q_weight_1d;
+  CeedScalar   *d_chebyshev_interp_1d;
 } CeedBasis_Hip;
 
 typedef struct {
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
new file mode 100644
index 0000000000..2784c1a4fc
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -0,0 +1,355 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA tensor product basis with AtPoints evaluation
+
+#include <ceed.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[0]  = chebyshev_x[1];
+    chebyshev_x[1]  = chebyshev_x[2];
+    chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
+    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                          const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+
+  // Apply basis element by element
+  if (is_transpose) {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedInt           pre   = 1;
+        CeedInt           post  = 1;
+
+        // Clear Chebyshev coeffs
+        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+          s_chebyshev_coeffs[k] = 0.0;
+        }
+
+        // Map from point
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          pre  = 1;
+          post = 1;
+          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (d == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+                }
+              }
+            }
+            post *= Q;
+          }
+        }
+
+        // Map from coefficients
+        pre  = BASIS_NUM_QPTS;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= Q;
+          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * P;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % P;
+            const CeedInt a   = k / (post * P);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= P;
+        }
+      }
+    }
+  } else {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedInt           pre   = u_size;
+        CeedInt           post  = 1;
+
+        // Map to coefficients
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= P;
+          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * Q;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % Q;
+            const CeedInt a   = k / (post * Q);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= Q;
+        }
+
+        // Map to point
+        __syncthreads();
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          pre  = BASIS_NUM_QPTS;
+          post = 1;
+          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+            // Update buffers used
+            pre /= Q;
+            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                CeedScalar v_k = 0;
+
+                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+                out[a * post + c] = v_k;
+              }
+            }
+            post *= 1;
+          }
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
+  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  if (is_transpose) {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedInt     pre   = 1;
+        CeedInt     post  = 1;
+
+        // Clear Chebyshev coeffs
+        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+          s_chebyshev_coeffs[k] = 0.0;
+        }
+
+        // Map from point
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+            const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
+
+            pre  = 1;
+            post = 1;
+            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+              // Update buffers used
+              pre /= 1;
+              const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
+              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+              // Build Chebyshev polynomial values
+              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+              // Contract along middle index
+              for (CeedInt a = 0; a < pre; a++) {
+                for (CeedInt c = 0; c < post; c++) {
+                  if (dim_2 == BASIS_DIM - 1) {
+                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                  } else {
+                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+                  }
+                }
+              }
+              post *= Q;
+            }
+          }
+        }
+
+        // Map from coefficients
+        pre  = BASIS_NUM_QPTS;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= Q;
+          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * P;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % P;
+            const CeedInt a   = k / (post * P);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= P;
+        }
+      }
+    }
+  } else {
+    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        CeedInt           pre   = u_size;
+        CeedInt           post  = 1;
+
+        // Map to coefficients
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          __syncthreads();
+          // Update buffers used
+          pre /= P;
+          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+          const CeedInt     writeLen = pre * post * Q;
+
+          // Contract along middle index
+          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+            const CeedInt c   = k % post;
+            const CeedInt j   = (k / post) % Q;
+            const CeedInt a   = k / (post * Q);
+            CeedScalar    v_k = 0;
+
+            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+            out[k] = v_k;
+          }
+          post *= Q;
+        }
+
+        // Map to point
+        __syncthreads();
+        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+            CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+
+            pre  = BASIS_NUM_QPTS;
+            post = 1;
+            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+              // Update buffers used
+              pre /= Q;
+              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+              // Build Chebyshev polynomial values
+              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+
+              // Contract along middle index
+              for (CeedInt a = 0; a < pre; a++) {
+                for (CeedInt c = 0; c < post; c++) {
+                  CeedScalar v_k = 0;
+
+                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+                  out[a * post + c] = v_k;
+                }
+              }
+              post *= 1;
+            }
+          }
+        }
+      }
+    }
+  }
+}

From 1dda9c1ab34a6c4075a3e9d5ebc86d8c1194a9e1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 17 Jun 2024 16:00:58 -0600
Subject: [PATCH 108/571] gpu - add intial AtPoints to shared mem backends, but
 using ref impl

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 121 ++++++++++++++++++
 backends/cuda-shared/ceed-cuda-shared.h       |   5 +
 backends/hip-shared/ceed-hip-shared-basis.c   | 121 ++++++++++++++++++
 backends/hip-shared/ceed-hip-shared.h         |   5 +
 4 files changed, 252 insertions(+)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index f5b5897167..3c90bc03d5 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -186,6 +186,124 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed                   ceed;
+  CeedInt                Q_1d, dim, max_num_points = num_points[0];
+  const CeedInt          is_transpose   = t_mode == CEED_TRANSPOSE;
+  const int              max_block_size = 32;
+  const CeedScalar      *d_x, *d_u;
+  CeedScalar            *d_v;
+  CeedBasis_Cuda_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Check uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) {
+    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
+              "BasisApplyAtPoints only supported for the same number of points in each element");
+  }
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCall(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    char       *basis_kernel_source;
+    const char *basis_kernel_path;
+    CeedInt     num_comp;
+
+    if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
+                                     num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
+                                     max_num_points, "POINTS_BUFF_LEN",
+                                     max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedFree(&basis_kernel_path));
+    CeedCallBackend(CeedFree(&basis_kernel_source));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+
+  // Clear v for transpose operation
+  if (is_transpose) {
+    CeedSize length;
+
+    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+    } break;
+    case CEED_EVAL_GRAD: {
+      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = max_block_size;
+
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy basis
 //------------------------------------------------------------------------------
@@ -196,10 +314,12 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
+  if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
+  CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -265,6 +385,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index ffc70dd6f5..40bbaed6a0 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -17,10 +17,15 @@ typedef struct {
   CUfunction  Grad;
   CUfunction  GradTranspose;
   CUfunction  Weight;
+  CUmodule    moduleAtPoints;
+  CeedInt     num_points;
+  CUfunction  InterpAtPoints;
+  CUfunction  GradAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_collo_grad_1d;
   CeedScalar *d_q_weight_1d;
+  CeedScalar *d_chebyshev_interp_1d;
   CeedScalar *c_B;
   CeedScalar *c_G;
 } CeedBasis_Cuda_shared;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 298a270292..5bdf90a251 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -245,6 +245,124 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                      CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed                  ceed;
+  CeedInt               Q_1d, dim, max_num_points = num_points[0];
+  const CeedInt         is_transpose   = t_mode == CEED_TRANSPOSE;
+  const int             max_block_size = 32;
+  const CeedScalar     *d_x, *d_u;
+  CeedScalar           *d_v;
+  CeedBasis_Hip_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Check uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) {
+    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
+              "BasisApplyAtPoints only supported for the same number of points in each element");
+  }
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCall(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    char       *basis_kernel_source;
+    const char *basis_kernel_path;
+    CeedInt     num_comp;
+
+    if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h", &basis_kernel_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
+                                    num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
+                                    max_num_points, "POINTS_BUFF_LEN",
+                                    max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedFree(&basis_kernel_path));
+    CeedCallBackend(CeedFree(&basis_kernel_source));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+
+  // Clear v for transpose operation
+  if (is_transpose) {
+    CeedSize length;
+
+    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+    } break;
+    case CEED_EVAL_GRAD: {
+      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = max_block_size;
+
+      CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy basis
 //------------------------------------------------------------------------------
@@ -255,10 +373,12 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
+  if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
+  CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -329,6 +449,7 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index 6a7c99d048..8bc9d041a2 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -17,11 +17,16 @@ typedef struct {
   hipFunction_t Grad;
   hipFunction_t GradTranspose;
   hipFunction_t Weight;
+  hipModule_t   moduleAtPoints;
+  CeedInt       num_points;
+  hipFunction_t InterpAtPoints;
+  hipFunction_t GradAtPoints;
   CeedInt       block_sizes[3];  // interp, grad, weight thread block sizes
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_collo_grad_1d;
   CeedScalar   *d_q_weight_1d;
+  CeedScalar   *d_chebyshev_interp_1d;
 } CeedBasis_Hip_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,

From 2d10e82cfebdb19665f7837a50e66638591485e2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 17 Jun 2024 17:17:41 -0600
Subject: [PATCH 109/571] AtPoints - fix gpu thread usage

---
 backends/cuda-ref/ceed-cuda-ref-basis.c            |  2 +-
 backends/cuda-shared/ceed-cuda-shared-basis.c      |  2 +-
 backends/hip-ref/ceed-hip-ref-basis.c              |  2 +-
 backends/hip-shared/ceed-hip-shared-basis.c        |  2 +-
 .../cuda/cuda-ref-basis-tensor-at-points.h         | 14 ++++++++------
 .../hip/hip-ref-basis-tensor-at-points.h           | 14 ++++++++------
 6 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index e1cf00f2eb..f89c3d970b 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -179,7 +179,7 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C
     } break;
     case CEED_EVAL_GRAD: {
       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = max_block_size;
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 3c90bc03d5..9e2f1b5d56 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -282,7 +282,7 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
     } break;
     case CEED_EVAL_GRAD: {
       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = max_block_size;
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 06e599d847..6ca84e9e21 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -178,7 +178,7 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce
     } break;
     case CEED_EVAL_GRAD: {
       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = max_block_size;
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 5bdf90a251..9ea556d1d6 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -341,7 +341,7 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c
     } break;
     case CEED_EVAL_GRAD: {
       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = max_block_size;
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 2784c1a4fc..74f162c27f 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -80,7 +80,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
         }
 
         // Map from point
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        __syncthreads();
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           pre  = 1;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -96,7 +97,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             for (CeedInt a = 0; a < pre; a++) {
               for (CeedInt c = 0; c < post; c++) {
                 if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
                 } else {
                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
@@ -163,7 +164,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 
         // Map to point
         __syncthreads();
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           pre  = BASIS_NUM_QPTS;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -233,7 +234,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         }
 
         // Map from point
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        __syncthreads();
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
 
@@ -253,7 +255,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
               for (CeedInt a = 0; a < pre; a++) {
                 for (CeedInt c = 0; c < post; c++) {
                   if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
                   } else {
                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                   }
@@ -320,7 +322,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
 
         // Map to point
         __syncthreads();
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
 
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 2784c1a4fc..74f162c27f 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -80,7 +80,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
         }
 
         // Map from point
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        __syncthreads();
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           pre  = 1;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -96,7 +97,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             for (CeedInt a = 0; a < pre; a++) {
               for (CeedInt c = 0; c < post; c++) {
                 if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
                 } else {
                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
@@ -163,7 +164,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 
         // Map to point
         __syncthreads();
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           pre  = BASIS_NUM_QPTS;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -233,7 +234,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         }
 
         // Map from point
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        __syncthreads();
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
 
@@ -253,7 +255,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
               for (CeedInt a = 0; a < pre; a++) {
                 for (CeedInt c = 0; c < post; c++) {
                   if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
+                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
                   } else {
                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                   }
@@ -320,7 +322,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
 
         // Map to point
         __syncthreads();
-        for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
+        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
 

From f7c9815f6bdd2aac16e9c7ed574a2a61404669c3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 20 Jun 2024 15:16:38 -0600
Subject: [PATCH 110/571] AtPoints - ease memory requirement

---
 backends/cuda-ref/ceed-cuda-ref-basis.c                    | 7 +++----
 backends/cuda-shared/ceed-cuda-shared-basis.c              | 5 ++---
 backends/hip-ref/ceed-hip-ref-basis.c                      | 7 +++----
 backends/hip-shared/ceed-hip-shared-basis.c                | 5 ++---
 .../ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h | 4 ++--
 .../ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h   | 4 ++--
 6 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index f89c3d970b..91958b6b25 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -145,10 +145,9 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C
     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                     num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
-                                     max_num_points, "POINTS_BUFF_LEN",
-                                     max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
     CeedCallBackend(CeedFree(&basis_kernel_path));
@@ -363,7 +362,7 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                   num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                   Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 9e2f1b5d56..118d156a84 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -248,10 +248,9 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                     num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
-                                     max_num_points, "POINTS_BUFF_LEN",
-                                     max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
     CeedCallBackend(CeedFree(&basis_kernel_path));
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 6ca84e9e21..fd1a9fabde 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -144,10 +144,9 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce
     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                    num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                    Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
-                                    max_num_points, "POINTS_BUFF_LEN",
-                                    max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+                                    max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
     CeedCallBackend(CeedFree(&basis_kernel_path));
@@ -362,7 +361,7 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                  num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                  Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 9ea556d1d6..ff41f9efe6 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -307,10 +307,9 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c
     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                    num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                    Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
-                                    max_num_points, "POINTS_BUFF_LEN",
-                                    max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
+                                    max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
     CeedCallBackend(CeedFree(&basis_kernel_path));
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 74f162c27f..9783dfd14c 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -47,7 +47,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
-  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
   CeedScalar           *s_chebyshev_interp_1d = s_mem;
   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
@@ -200,7 +200,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
-  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
   CeedScalar           *s_chebyshev_interp_1d = s_mem;
   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 74f162c27f..9783dfd14c 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -47,7 +47,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
-  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
   CeedScalar           *s_chebyshev_interp_1d = s_mem;
   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
@@ -200,7 +200,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
-  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
   CeedScalar           *s_chebyshev_interp_1d = s_mem;
   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;

From 14950a8eea941c036fb81fbb2249468a1035cf45 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 21 Jun 2024 10:35:02 -0600
Subject: [PATCH 111/571] magma - explicitly exclude BasisApplyAtPoints

---
 backends/magma/ceed-magma-basis.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index f1e52ba2db..de18a1a2fc 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -248,6 +248,14 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+int CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                 CeedVector x_ref, CeedVector u, CeedVector v) {
+  return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "Backend does not implement CeedBasisApplyAtPoints");
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
@@ -580,6 +588,7 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
   return CEED_ERROR_SUCCESS;
 }

From 9dc0ea9a12d5a2dbb50983bee29c25b398979cc0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 1 Jul 2024 11:33:29 -0600
Subject: [PATCH 112/571] rust - drop old tarpauln cfg statements

---
 examples/rust/ex1-volume/src/main.rs         | 1 -
 examples/rust/ex1-volume/src/opt.rs          | 1 -
 examples/rust/ex2-surface/src/main.rs        | 1 -
 examples/rust/ex2-surface/src/opt.rs         | 1 -
 examples/rust/ex3-vector-volume/src/main.rs  | 1 -
 examples/rust/ex3-vector-volume/src/opt.rs   | 1 -
 examples/rust/ex4-vector-surface/src/main.rs | 1 -
 examples/rust/ex4-vector-surface/src/opt.rs  | 1 -
 8 files changed, 8 deletions(-)

diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs
index bea2e2f79c..6120d41999 100644
--- a/examples/rust/ex1-volume/src/main.rs
+++ b/examples/rust/ex1-volume/src/main.rs
@@ -26,7 +26,6 @@ mod transform;
 // ----------------------------------------------------------------------------
 // Example 1
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
     example_1(options)
diff --git a/examples/rust/ex1-volume/src/opt.rs b/examples/rust/ex1-volume/src/opt.rs
index 8fd8b71a0d..94fca2594a 100644
--- a/examples/rust/ex1-volume/src/opt.rs
+++ b/examples/rust/ex1-volume/src/opt.rs
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 1 - Volume",
     about = "This example uses the mass matrix to compute the length, area, or volume of a region, depending upon runtime parameters."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index 02349bc666..1c44e7cba5 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -27,7 +27,6 @@ mod transform;
 // ----------------------------------------------------------------------------
 // Example 2
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
     example_2(options)
diff --git a/examples/rust/ex2-surface/src/opt.rs b/examples/rust/ex2-surface/src/opt.rs
index 13b58f26d7..fcf903e501 100644
--- a/examples/rust/ex2-surface/src/opt.rs
+++ b/examples/rust/ex2-surface/src/opt.rs
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 2 - Surface Area",
     about = "This example illustrates a simple usage of libCEED to compute the surface area of a body using matrix-free application of a diffusion operator."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex3-vector-volume/src/main.rs
index 9b3cced2b8..24d40ae1a0 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex3-vector-volume/src/main.rs
@@ -27,7 +27,6 @@ mod transform;
 // ----------------------------------------------------------------------------
 // Example 3
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
     example_3(options)
diff --git a/examples/rust/ex3-vector-volume/src/opt.rs b/examples/rust/ex3-vector-volume/src/opt.rs
index 5de7c68f08..7ece85540f 100644
--- a/examples/rust/ex3-vector-volume/src/opt.rs
+++ b/examples/rust/ex3-vector-volume/src/opt.rs
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 3 - Vector Volume",
     about = "This example uses the mass matrix to compute the length, area, or volume of a region in triplicate, depending upon runtime parameters."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex4-vector-surface/src/main.rs
index 5847d8033f..5b788274e8 100644
--- a/examples/rust/ex4-vector-surface/src/main.rs
+++ b/examples/rust/ex4-vector-surface/src/main.rs
@@ -28,7 +28,6 @@ mod transform;
 // ----------------------------------------------------------------------------
 // Example 4
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
     example_4(options)
diff --git a/examples/rust/ex4-vector-surface/src/opt.rs b/examples/rust/ex4-vector-surface/src/opt.rs
index 8f58427120..7b335a8e53 100644
--- a/examples/rust/ex4-vector-surface/src/opt.rs
+++ b/examples/rust/ex4-vector-surface/src/opt.rs
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 4 - Vector Surface Area",
     about = "This example illustrates a simple usage of libCEED to compute the surface area of a body using matrix-free application of a 3 component vector diffusion operator."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]

From f5113f16dad8d7c9d407fe5e4b38723ab98efdb3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 1 Jul 2024 11:34:32 -0600
Subject: [PATCH 113/571] rust - slightly reduce ex03 1D tol

---
 examples/rust/ex3-vector-volume/src/main.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex3-vector-volume/src/main.rs
index 24d40ae1a0..a2102e5d0f 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex3-vector-volume/src/main.rs
@@ -268,7 +268,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
         );
     }
     let tolerance = match dim {
-        1 => 100.0 * libceed::EPSILON,
+        1 => 200.0 * libceed::EPSILON,
         _ => 1E-5,
     };
     let error = (volume - exact_volume).abs();

From 392d940c90dcc80d2c8cd0455511b2e57938a796 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 1 Jul 2024 11:37:27 -0600
Subject: [PATCH 114/571] rust - slightly loosen tols on doctests

---
 rust/libceed/src/operator.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 91e6b6f4a4..bd5d7f37f4 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -1625,7 +1625,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_fine.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 50.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     ///
@@ -1635,7 +1635,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_coarse.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 50.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     /// # Ok(())
@@ -1814,7 +1814,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_fine.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     ///
@@ -1824,7 +1824,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_coarse.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     /// # Ok(())
@@ -2005,7 +2005,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_fine.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     ///
@@ -2015,7 +2015,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_coarse.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     /// # Ok(())

From 38690fecfcf11bfdf51c3082dadeac248fcdcecf Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 2 Jul 2024 16:25:18 -0600
Subject: [PATCH 115/571] fluids: Fix memory leaks

As done in HONEE: https://gitlab.com/phypid/honee/-/merge_requests/18
---
 examples/fluids/navierstokes.c                  | 17 ++++++++++++++++-
 examples/fluids/problems/blasius.c              | 13 ++++++++++---
 examples/fluids/problems/eulervortex.c          |  1 +
 examples/fluids/problems/newtonian.c            |  2 +-
 examples/fluids/problems/stg_shur14.c           |  2 ++
 examples/fluids/src/differential_filter.c       |  7 +++++++
 examples/fluids/src/misc.c                      | 10 +++++-----
 examples/fluids/src/setuplibceed.c              |  1 -
 .../fluids/src/strong_boundary_conditions.c     |  9 +++++----
 examples/fluids/src/turb_spanstats.c            |  1 +
 10 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index ed0cafb8dd..4f2c5ae93f 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -328,7 +328,22 @@ int main(int argc, char **argv) {
 
   PetscCall(PetscFree(app_ctx->amat_type));
   PetscCall(PetscFree(app_ctx->wall_forces.walls));
-  PetscCall(PetscViewerDestroy(&app_ctx->wall_forces.viewer));
+  {
+    const char *filename  = NULL;
+    PetscBool   is_stdout = PETSC_FALSE;
+
+    if (app_ctx->wall_forces.viewer) {
+      PetscCall(PetscViewerFileGetName(app_ctx->wall_forces.viewer, &filename));
+      if (filename) PetscCall(PetscStrncmp(filename, "stdout", 7, &is_stdout));
+      if (!is_stdout) PetscCall(PetscViewerDestroy(&app_ctx->wall_forces.viewer));
+    }
+
+    if (app_ctx->turb_spanstats_viewer) {
+      PetscCall(PetscViewerFileGetName(app_ctx->turb_spanstats_viewer, &filename));
+      if (filename) PetscCall(PetscStrncmp(filename, "stdout", 7, &is_stdout));
+      if (!is_stdout) PetscCall(PetscViewerDestroy(&app_ctx->turb_spanstats_viewer));
+    }
+  }
 
   // -- Structs
   for (PetscInt i = 0; i < problem->num_bc_defs; i++) {
diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index bb51102f6f..5481e90896 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -119,26 +119,33 @@ static PetscErrorCode GetYNodeLocs(const MPI_Comm comm, const char path[PETSC_MA
   FILE          *fp;
   const PetscInt char_array_len = 512;
   char           line[char_array_len];
-  char         **array;
   PetscReal     *node_locs;
 
   PetscFunctionBeginUser;
   PetscCall(PetscFOpen(comm, path, "r", &fp));
   PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line));
-  PetscCall(PetscStrToArray(line, ' ', &ndims, &array));
 
-  for (PetscInt i = 0; i < ndims; i++) dims[i] = atoi(array[i]);
+  {
+    char **array;
+
+    PetscCall(PetscStrToArray(line, ' ', &ndims, &array));
+    for (PetscInt i = 0; i < ndims; i++) dims[i] = atoi(array[i]);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
+  }
   if (ndims < 2) dims[1] = 1;  // Assume 1 column of data is not otherwise specified
   *nynodes = dims[0];
   PetscCall(PetscMalloc1(*nynodes, &node_locs));
 
   for (PetscInt i = 0; i < dims[0]; i++) {
+    char **array;
+
     PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line));
     PetscCall(PetscStrToArray(line, ' ', &ndims, &array));
     PetscCheck(ndims == dims[1], comm, PETSC_ERR_FILE_UNEXPECTED,
                "Line %" PetscInt_FMT " of %s does not contain correct number of columns (%d instead of %d)", i, path, ndims, dims[1]);
 
     node_locs[i] = (PetscReal)atof(array[0]);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
   }
   PetscCall(PetscFClose(comm, fp));
   *pynodes = node_locs;
diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c
index 8397561102..c0b4bff51f 100644
--- a/examples/fluids/problems/eulervortex.c
+++ b/examples/fluids/problems/eulervortex.c
@@ -138,6 +138,7 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_vol_ifunction.qfunction_context));
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_inflow.qfunction_context));
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_outflow.qfunction_context));
+  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&euler_context));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 6d69ca0f63..bc673e00dd 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -203,6 +203,7 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -290,7 +291,6 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
       problem->apply_inflow_jacobian.qfunction     = BoundaryIntegral_Jacobian_Conserv;
       problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Conserv_loc;
       break;
-
     case STATEVAR_PRIMITIVE:
       problem->ics.qfunction                       = ICsNewtonianIG_Prim;
       problem->ics.qfunction_loc                   = ICsNewtonianIG_Prim_loc;
diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c
index 5deb62c787..58be441f81 100644
--- a/examples/fluids/problems/stg_shur14.c
+++ b/examples/fluids/problems/stg_shur14.c
@@ -99,6 +99,7 @@ static PetscErrorCode ReadStgInflow(const MPI_Comm comm, const char path[PETSC_M
     PetscCheck(wall_dist[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Distance to wall in %s cannot be negative", path);
     PetscCheck(lt[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Turbulent length scale in %s cannot be negative", path);
     PetscCheck(eps[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Turbulent dissipation in %s cannot be negative", path);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
   }
   CeedScalar(*cij)[stg_ctx->nprofs] = (CeedScalar(*)[stg_ctx->nprofs]) & stg_ctx->data[stg_ctx->offsets.cij];
   PetscCall(CalcCholeskyDecomp(comm, stg_ctx->nprofs, rij, cij));
@@ -144,6 +145,7 @@ static PetscErrorCode ReadStgRand(const MPI_Comm comm, const char path[PETSC_MAX
     sigma[0][i] = (CeedScalar)atof(array[4]);
     sigma[1][i] = (CeedScalar)atof(array[5]);
     sigma[2][i] = (CeedScalar)atof(array[6]);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
   }
   PetscCall(PetscFClose(comm, fp));
   PetscFunctionReturn(PETSC_SUCCESS);
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index 04d264a112..c476f1896c 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -8,6 +8,7 @@
 /// Functions for setting up and performing differential filtering
 
 #include "../qfunctions//differential_filter.h"
+#include <ceed.h>
 
 #include <petscdmplex.h>
 
@@ -70,6 +71,9 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
 
       PetscCall(PetscSNPrintf(field_name, PETSC_MAX_PATH_LEN, "v%" PetscInt_FMT, dm_field));
       PetscCallCeed(ceed, CeedOperatorSetField(op_rhs, field_name, elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE));
+
+      PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_filter));
+      PetscCallCeed(ceed, CeedBasisDestroy(&basis_filter));
     }
 
     PetscCall(OperatorApplyContextCreate(user->dm, dm_filter, ceed, op_rhs, NULL, NULL, user->Q_loc, NULL, &diff_filter->op_rhs_ctx));
@@ -150,6 +154,9 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
       PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_lhs));
       PetscCallCeed(ceed, CeedOperatorDestroy(&op_lhs_sub));
     }
+    PetscCallCeed(ceed, CeedVectorDestroy(&grid_aniso_ceed));
+    PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_grid_aniso));
+
     PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_lhs, "filter width scaling", &diff_filter->filter_width_scaling_label));
     PetscCall(MatCeedCreate(dm_filter, dm_filter, op_lhs, NULL, &mat_lhs));
 
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index f60f931555..5658f109f1 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -341,24 +341,24 @@ PetscErrorCode PhastaDatFileGetNRows(const MPI_Comm comm, const char path[PETSC_
 
 PetscErrorCode PhastaDatFileReadToArrayReal(MPI_Comm comm, const char path[PETSC_MAX_PATH_LEN], PetscReal array[]) {
   PetscInt       dims[2];
-  int            ndims;
   FILE          *fp;
   const PetscInt char_array_len = 512;
   char           line[char_array_len];
-  char         **row_array;
 
   PetscFunctionBeginUser;
   PetscCall(PhastaDatFileOpen(comm, path, char_array_len, dims, &fp));
 
   for (PetscInt i = 0; i < dims[0]; i++) {
+    int    ndims;
+    char **row_array;
+
     PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line));
     PetscCall(PetscStrToArray(line, ' ', &ndims, &row_array));
     PetscCheck(ndims == dims[1], comm, PETSC_ERR_FILE_UNEXPECTED,
                "Line %" PetscInt_FMT " of %s does not contain enough columns (%d instead of %" PetscInt_FMT ")", i, path, ndims, dims[1]);
 
-    for (PetscInt j = 0; j < dims[1]; j++) {
-      array[i * dims[1] + j] = (PetscReal)atof(row_array[j]);
-    }
+    for (PetscInt j = 0; j < dims[1]; j++) array[i * dims[1] + j] = (PetscReal)atof(row_array[j]);
+    PetscCall(PetscStrToArrayDestroy(ndims, row_array));
   }
 
   PetscCall(PetscFClose(comm, fp));
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 628b370617..29b2d8f825 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -81,7 +81,6 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) {
       PetscCall(KSPSetType(user->mass_ksp, KSPPREONLY));
     }
     PetscCall(KSPSetFromOptions_WithMatCeed(user->mass_ksp, mat_mass));
-    PetscCall(KSPSetFromOptions(user->mass_ksp));
     PetscCall(VecDestroy(&Zeros_loc));
     PetscCall(MatDestroy(&mat_mass));
   }
diff --git a/examples/fluids/src/strong_boundary_conditions.c b/examples/fluids/src/strong_boundary_conditions.c
index 532f486412..eeb48cb7af 100644
--- a/examples/fluids/src/strong_boundary_conditions.c
+++ b/examples/fluids/src/strong_boundary_conditions.c
@@ -53,6 +53,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
 
   // Setup STG Setup QFunction
   PetscCall(SetupStrongStg_PreProcessing(ceed, problem, num_comp_x, stg_data_size, dXdx_size, &qf_stgdata));
+  PetscCall(SetupStrongStg_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, dXdx_size, &qf_strongbc));
 
   // Compute contribution on each boundary face
   for (CeedInt i = 0; i < bc->num_inflow; i++) {
@@ -92,8 +93,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
 
     PetscCallCeed(ceed, CeedOperatorApply(op_stgdata, CEED_VECTOR_NONE, stg_data, CEED_REQUEST_IMMEDIATE));
 
-    // -- Setup BC QFunctions
-    PetscCall(SetupStrongStg_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, dXdx_size, &qf_strongbc));
+    // -- Setup BC Sub Operator
     PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_strongbc, NULL, NULL, &op_strong_bc_sub));
     PetscCallCeed(ceed, CeedOperatorSetName(op_strong_bc_sub, "Strong STG"));
 
@@ -117,8 +117,6 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
     PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_scale));
     PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_stgdata));
     PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dXdx));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_strongbc));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stgdata));
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_strong_bc_sub));
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup));
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_stgdata));
@@ -127,6 +125,8 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
   PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_strong_bc, "solution time", &phys->stg_solution_time_label));
 
   PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_to_q_sur));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_strongbc));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stgdata));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -181,5 +181,6 @@ PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User use
   PetscCall(OperatorApplyContextCreate(NULL, NULL, ceed, op_strong_bc, CEED_VECTOR_NONE, NULL, NULL, NULL, &user->op_strong_bc_ctx));
 
   PetscCall(PetscObjectComposeFunction((PetscObject)dm, "DMPlexInsertBoundaryValues_C", DMPlexInsertBoundaryValues_StrongBCCeed));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_strong_bc));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 6cd7370a37..08f9ef36b9 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -65,6 +65,7 @@ PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) {
       for (PetscInt i = 0; i < nleaves; i++) {
         PetscCall(DMLabelSetValue(label, ilocal[i], 1));
       }
+      PetscCall(PetscSFDestroy(&inv_isoperiodicface));
     } else {
       PetscCall(DMGetLabel(user->dm, "Face Sets", &label));
     }

From e00f3be8a4d9def604e9e5af48283226b271555e Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 3 Jul 2024 10:31:29 -0600
Subject: [PATCH 116/571] basis: Projection uses highest dimension topology

---
 interface/ceed-basis.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 30207eb3b3..26c52a8d7d 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1139,7 +1139,7 @@ int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, Ce
   @brief Create a non tensor-product basis for \f$H^1\f$ discretizations
 
   @param[in]  ceed      `Ceed` object used to create the `CeedBasis`
-  @param[in]  topo      Topology of element, e.g. hypercube, simplex, ect
+  @param[in]  topo      Topology of element, e.g. hypercube, simplex, etc
   @param[in]  num_comp  Number of field components (1 for scalar fields)
   @param[in]  num_nodes Total number of nodes
   @param[in]  num_qpts  Total number of quadrature points
@@ -1368,7 +1368,7 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi
     CeedInt          num_nodes_to, num_nodes_from;
     CeedElemTopology topo;
 
-    CeedCall(CeedBasisGetTopology(basis_to, &topo));
+    CeedCall(CeedBasisGetTopology(basis_from, &topo));
     CeedCall(CeedBasisGetNumNodes(basis_from, &num_nodes_from));
     CeedCall(CeedBasisGetNumNodes(basis_to, &num_nodes_to));
     CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, NULL, NULL, basis_project));

From b3ed00e56067378526ecb436b5f53e6a8373c1df Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 8 Jul 2024 10:21:05 -0600
Subject: [PATCH 117/571] fix(basis): Use basis_from dim for projection
 matrices

---
 interface/ceed-basis.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 26c52a8d7d..d110988371 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -234,7 +234,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   CeedScalar       *interp_to_inv, *interp_from;
   const CeedScalar *interp_to_source = NULL, *interp_from_source = NULL, *grad_from_source = NULL;
 
-  CeedCall(CeedBasisGetDimension(basis_to, &dim));
+  CeedCall(CeedBasisGetDimension(basis_from, &dim));
   if (are_both_tensor) {
     CeedCall(CeedBasisGetInterp1D(basis_to, &interp_to_source));
     CeedCall(CeedBasisGetInterp1D(basis_from, &interp_from_source));

From 706bf528b2ed1928aa5b7f1351e7e0b12042c20e Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 8 Jul 2024 11:52:15 -0600
Subject: [PATCH 118/571] test: Add mixed topology basis projection test

---
 tests/t319-basis.c | 36 +++++++++++++++++++++++
 tests/t319-basis.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 tests/t319-basis.h

diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index 2aee7d4e4f..18afff0a9e 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -1,6 +1,7 @@
 /// @file
 /// Test projection interp and grad in multiple dimensions
 /// \test Test projection interp and grad in multiple dimensions
+#include "t319-basis.h"
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>
@@ -201,6 +202,41 @@ int main(int argc, char **argv) {
     CeedBasisDestroy(&basis_to_nontensor);
     CeedBasisDestroy(&basis_project);
   }
+
+  // Test projection between basis of different topological dimension
+  {
+    CeedInt   face_dim = 2, P_1D = 2;
+    CeedBasis basis_face, basis_cell_to_face, basis_proj;
+
+    CeedScalar       *q_ref = NULL, *q_weights = NULL;
+    const CeedScalar *grad, *interp;
+    CeedInt           P, Q;
+    GetCellToFaceTabulation(CEED_GAUSS, &P, &Q, &interp, &grad);
+
+    CeedBasisCreateTensorH1Lagrange(ceed, face_dim, 1, 2, P_1D, CEED_GAUSS, &basis_face);
+    CeedBasisCreateH1(ceed, CEED_TOPOLOGY_HEX, 1, P, Q, (CeedScalar *)interp, (CeedScalar *)grad, q_ref, q_weights, &basis_cell_to_face);
+    CeedBasisCreateProjection(basis_cell_to_face, basis_face, &basis_proj);
+    const CeedScalar *interp_proj, *grad_proj, *interp_proj_ref, *grad_proj_ref;
+
+    GetCellToFaceTabulation(CEED_GAUSS_LOBATTO, NULL, NULL, &interp_proj_ref, &grad_proj_ref);
+    CeedBasisGetInterp(basis_proj, &interp_proj);
+    CeedBasisGetGrad(basis_proj, &grad_proj);
+    CeedScalar tol = 100 * CEED_EPSILON;
+
+    for (CeedInt i = 0; i < 4 * 8; i++) {
+      if (fabs(interp_proj[i] - ((CeedScalar *)interp_proj_ref)[i]) > tol)
+        printf("Mixed Topology Projection: interp[%" CeedInt_FMT "] expected %f, got %f\n", i, interp_proj[i], ((CeedScalar *)interp_proj_ref)[i]);
+    }
+
+    for (CeedInt i = 0; i < 3 * 4 * 8; i++) {
+      if (fabs(grad_proj[i] - ((CeedScalar *)grad_proj_ref)[i]) > tol)
+        printf("Mixed Topology Projection: grad[%" CeedInt_FMT "] expected %f, got %f\n", i, grad_proj[i], ((CeedScalar *)grad_proj_ref)[i]);
+    }
+
+    CeedBasisDestroy(&basis_face);
+    CeedBasisDestroy(&basis_cell_to_face);
+    CeedBasisDestroy(&basis_proj);
+  }
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t319-basis.h b/tests/t319-basis.h
new file mode 100644
index 0000000000..6f1a0cb5c9
--- /dev/null
+++ b/tests/t319-basis.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+
+// Interpolation matrices for cell-to-face of Q1 hexahedral element onto it's "5" face (in PETSc)
+// Nodes are at Gauss-Lobatto points and quadrature points are Gauss, all over [-1,1] domain range
+const CeedScalar Q1_interp_gauss[4][8] = {
+    {0.62200846792814612,  0, 0.16666666666666669,  0, 0.16666666666666669,  0, 0.044658198738520463, 0},
+    {0.16666666666666669,  0, 0.62200846792814612,  0, 0.044658198738520463, 0, 0.16666666666666669,  0},
+    {0.16666666666666669,  0, 0.044658198738520463, 0, 0.62200846792814612,  0, 0.16666666666666669,  0},
+    {0.044658198738520463, 0, 0.16666666666666669,  0, 0.16666666666666669,  0, 0.62200846792814612,  0}
+};
+const CeedScalar Q1_grad_gauss[3][4][8] = {
+    {{-0.31100423396407312, 0.31100423396407312, -0.083333333333333343, 0.083333333333333343, -0.083333333333333343, 0.083333333333333343,
+      -0.022329099369260232, 0.022329099369260232},
+     {-0.083333333333333343, 0.083333333333333343, -0.31100423396407312, 0.31100423396407312, -0.022329099369260232, 0.022329099369260232,
+      -0.083333333333333343, 0.083333333333333343},
+     {-0.083333333333333343, 0.083333333333333343, -0.022329099369260232, 0.022329099369260232, -0.31100423396407312, 0.31100423396407312,
+      -0.083333333333333343, 0.083333333333333343},
+     {-0.022329099369260232, 0.022329099369260232, -0.083333333333333343, 0.083333333333333343, -0.083333333333333343, 0.083333333333333343,
+      -0.31100423396407312, 0.31100423396407312}                                                       },
+    {{-0.39433756729740643, 0, 0.39433756729740643, 0, -0.10566243270259357, 0, 0.10566243270259357, 0},
+     {-0.39433756729740643, 0, 0.39433756729740643, 0, -0.10566243270259357, 0, 0.10566243270259357, 0},
+     {-0.10566243270259357, 0, 0.10566243270259357, 0, -0.39433756729740643, 0, 0.39433756729740643, 0},
+     {-0.10566243270259357, 0, 0.10566243270259357, 0, -0.39433756729740643, 0, 0.39433756729740643, 0}},
+    {{-0.39433756729740643, 0, -0.10566243270259357, 0, 0.39433756729740643, 0, 0.10566243270259357, 0},
+     {-0.10566243270259357, 0, -0.39433756729740643, 0, 0.10566243270259357, 0, 0.39433756729740643, 0},
+     {-0.39433756729740643, 0, -0.10566243270259357, 0, 0.39433756729740643, 0, 0.10566243270259357, 0},
+     {-0.10566243270259357, 0, -0.39433756729740643, 0, 0.10566243270259357, 0, 0.39433756729740643, 0}}
+};
+
+const CeedScalar Q1_interp_gauss_lobatto[4][8] = {
+    {1, 0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 1, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 1, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 1, 0}
+};
+/* clang-format off */
+const CeedScalar Q1_grad_gauss_lobatto[3][4][8] = {
+    {{-0.5,  0.5, 0,    0,   0,    0,   0,    0},
+      {0,    0,   -0.5, 0.5, 0,    0,   0,    0},
+      {0,    0,   0,    0,   -0.5, 0.5, 0,    0},
+      {0,    0,   0,    0,   0,    0,   -0.5, 0.5}},
+    {{-0.5,  0,   0.5,  0,   0,    0,   0,    0},
+      {-0.5, 0,   0.5,  0,   0,    0,   0,    0},
+      {0,    0,   0,    0,   -0.5, 0,   0.5,  0},
+      {0,    0,   0,    0,   -0.5, 0,   0.5,  0}},
+    {{-0.5,  0,   0,    0,   0.5,  0,   0,    0},
+      {0,    0,   -0.5, 0,   0,    0,   0.5,  0},
+      {-0.5, 0,   0,    0,   0.5,  0,   0,    0},
+      {0,    0,   -0.5, 0,   0,    0,   0.5,  0}}
+};
+/* clang-format on */
+
+static void GetCellToFaceTabulation(CeedQuadMode quad_mode, CeedInt *P, CeedInt *Q, const CeedScalar **interp, const CeedScalar **grad) {
+  if (P) *P = 8;
+  if (Q) *Q = 4;
+
+  if (quad_mode == CEED_GAUSS) {
+    *interp = (const CeedScalar *)Q1_interp_gauss;
+    *grad   = (const CeedScalar *)Q1_grad_gauss;
+  }
+  if (quad_mode == CEED_GAUSS_LOBATTO) {
+    *interp = (const CeedScalar *)Q1_interp_gauss_lobatto;
+    *grad   = (const CeedScalar *)Q1_grad_gauss_lobatto;
+  }
+}

From eaae1aea9f90bf9ac99c63e6c0ce6677c03a7837 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 9 Jul 2024 10:50:13 -0600
Subject: [PATCH 119/571] refactor: Changes for clang-format 18

---
 .github/workflows/c-fortran-test-style.yml                | 6 +++---
 backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp         | 4 ++--
 examples/fluids/qfunctions/blasius.h                      | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h | 6 +++---
 include/ceed/jit-source/hip/hip-ref-operator-assemble.h   | 6 +++---
 interface/ceed.c                                          | 3 +--
 6 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml
index a6daecc241..b570221d44 100644
--- a/.github/workflows/c-fortran-test-style.yml
+++ b/.github/workflows/c-fortran-test-style.yml
@@ -21,12 +21,12 @@ jobs:
     - name: Install clang-format
       run: |
           wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-          sudo add-apt-repository 'deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-17 main'
-          sudo apt update && sudo apt install clang-format-17
+          sudo add-apt-repository 'deb http://apt.llvm.org/noble/ llvm-toolchain-noble-18 main'
+          sudo apt update && sudo apt install clang-format-18
     - name: C style
       env:
         CC: ${{ matrix.compiler }}
         FC: gfortran-11
       run: |
         make info
-        make format-c -j2 CLANG_FORMAT=clang-format-17 && git diff --exit-code
+        make format-c -j2 CLANG_FORMAT=clang-format-18 && git diff --exit-code
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index f3cf95641a..2a9c59779f 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -1190,12 +1190,12 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
               result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l];
             }
           }  // end of  e_mode_out
-        }    // end of  e_mode_in
+        }  // end of  e_mode_in
         CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l;
 
         values_array[val_index] = result;
       }  // end of out component
-    }    // end of in component
+    }  // end of in component
   });
   return CEED_ERROR_SUCCESS;
 }
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index d80fe4ce63..738af58898 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -37,7 +37,7 @@ struct BlasiusContext_ {
 CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, double eta_max, double *f) {
   double dX_deta     = 2 / eta_max;
   double table[4][3] = {
-  // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1)
+      // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1)
       {1, x, 2 * x * x - 1},
       {0, 1, 4 * x        },
       {0, 0, 4            },
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 6333f771f2..318a07b163 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -62,7 +62,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                 result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l];
               }
             }  // end of out eval mode
-          }    // end of in eval mode
+          }  // end of in eval mode
           if (orients_in) {
             result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0;
           }
@@ -101,6 +101,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
           }
         }
       }  // end of out component
-    }    // end of in component
-  }      // end of element loop
+    }  // end of in component
+  }  // end of element loop
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index bf86921066..35789443b6 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -62,7 +62,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                 result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l];
               }
             }  // end of out eval mode
-          }    // end of in eval mode
+          }  // end of in eval mode
           if (orients_in) {
             result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0;
           }
@@ -101,6 +101,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
           }
         }
       }  // end of out component
-    }    // end of in component
-  }      // end of element loop
+    }  // end of in component
+  }  // end of element loop
 }
diff --git a/interface/ceed.c b/interface/ceed.c
index 073220d187..3c79c59b33 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -28,8 +28,7 @@ static struct {
 } backends[32];
 static size_t num_backends;
 
-#define CEED_FTABLE_ENTRY(class, method) \
-  { #class #method, offsetof(struct class##_private, method) }
+#define CEED_FTABLE_ENTRY(class, method) {#class #method, offsetof(struct class##_private, method)}
 /// @endcond
 
 /// @file

From ad8059fcea5cf37277b38187e19f76c260580632 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 10 Jul 2024 15:27:50 -0600
Subject: [PATCH 120/571] gpu - reduce write conflits for AtPoints basis
 operations

---
 backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp           | 4 ++--
 examples/fluids/qfunctions/blasius.h                        | 2 +-
 .../ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h  | 4 ++--
 include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h   | 6 +++---
 .../ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h    | 4 ++--
 include/ceed/jit-source/hip/hip-ref-operator-assemble.h     | 6 +++---
 interface/ceed.c                                            | 3 +--
 7 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index f3cf95641a..2a9c59779f 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -1190,12 +1190,12 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
               result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l];
             }
           }  // end of  e_mode_out
-        }    // end of  e_mode_in
+        }  // end of  e_mode_in
         CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l;
 
         values_array[val_index] = result;
       }  // end of out component
-    }    // end of in component
+    }  // end of in component
   });
   return CEED_ERROR_SUCCESS;
 }
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index d80fe4ce63..738af58898 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -37,7 +37,7 @@ struct BlasiusContext_ {
 CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, double eta_max, double *f) {
   double dX_deta     = 2 / eta_max;
   double table[4][3] = {
-  // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1)
+      // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1)
       {1, x, 2 * x * x - 1},
       {0, 1, 4 * x        },
       {0, 0, 4            },
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 9783dfd14c..600a847f37 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -97,7 +97,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             for (CeedInt a = 0; a < pre; a++) {
               for (CeedInt c = 0; c < post; c++) {
                 if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                 } else {
                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
@@ -255,7 +255,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
               for (CeedInt a = 0; a < pre; a++) {
                 for (CeedInt c = 0; c < post; c++) {
                   if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
+                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                   } else {
                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                   }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 6333f771f2..318a07b163 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -62,7 +62,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                 result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l];
               }
             }  // end of out eval mode
-          }    // end of in eval mode
+          }  // end of in eval mode
           if (orients_in) {
             result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0;
           }
@@ -101,6 +101,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
           }
         }
       }  // end of out component
-    }    // end of in component
-  }      // end of element loop
+    }  // end of in component
+  }  // end of element loop
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 9783dfd14c..600a847f37 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -97,7 +97,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             for (CeedInt a = 0; a < pre; a++) {
               for (CeedInt c = 0; c < post; c++) {
                 if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                 } else {
                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
@@ -255,7 +255,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
               for (CeedInt a = 0; a < pre; a++) {
                 for (CeedInt c = 0; c < post; c++) {
                   if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
+                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                   } else {
                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                   }
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index bf86921066..35789443b6 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -62,7 +62,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                 result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l];
               }
             }  // end of out eval mode
-          }    // end of in eval mode
+          }  // end of in eval mode
           if (orients_in) {
             result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0;
           }
@@ -101,6 +101,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
           }
         }
       }  // end of out component
-    }    // end of in component
-  }      // end of element loop
+    }  // end of in component
+  }  // end of element loop
 }
diff --git a/interface/ceed.c b/interface/ceed.c
index 073220d187..3c79c59b33 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -28,8 +28,7 @@ static struct {
 } backends[32];
 static size_t num_backends;
 
-#define CEED_FTABLE_ENTRY(class, method) \
-  { #class #method, offsetof(struct class##_private, method) }
+#define CEED_FTABLE_ENTRY(class, method) {#class #method, offsetof(struct class##_private, method)}
 /// @endcond
 
 /// @file

From 80c135a87dd608e39d180e7bb5c260aa9fcc10a1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 10 Jul 2024 15:48:50 -0600
Subject: [PATCH 121/571] gpu - less explicit memory shuffling to build
 Chebyshev der

---
 .../ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h  | 6 ++----
 .../ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h    | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 600a847f37..fc65a10912 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -29,10 +29,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
   chebyshev_dx[0] = 0.0;
   chebyshev_dx[1] = 2.0;
   for (CeedInt i = 2; i < Q_1D; i++) {
-    chebyshev_x[0]  = chebyshev_x[1];
-    chebyshev_x[1]  = chebyshev_x[2];
-    chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
-    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
   }
 }
 
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 600a847f37..fc65a10912 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -29,10 +29,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
   chebyshev_dx[0] = 0.0;
   chebyshev_dx[1] = 2.0;
   for (CeedInt i = 2; i < Q_1D; i++) {
-    chebyshev_x[0]  = chebyshev_x[1];
-    chebyshev_x[1]  = chebyshev_x[2];
-    chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
-    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
   }
 }
 

From 756ca9e91ae9d9e5d3677ad5d97bcf1a7f71087a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 20 Jun 2024 12:53:29 -0600
Subject: [PATCH 122/571] cuda - add AtPoints CeedOperator

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 255 ++++++++++++++++++++-
 backends/cuda-ref/ceed-cuda-ref.c          |   1 +
 backends/cuda-ref/ceed-cuda-ref.h          |   4 +-
 3 files changed, 254 insertions(+), 6 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index c6604907ba..9972a7d8c3 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -42,6 +42,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
   }
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
+  CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
   // QFunction assembly data
   for (CeedInt i = 0; i < impl->num_active_in; i++) {
@@ -95,8 +96,8 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e,
-                                        CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, CeedVector *e_vecs, CeedVector *q_vecs,
+                                        CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -170,7 +171,15 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         q_size = (CeedSize)num_elem * Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        if (is_at_points) {
+          CeedInt num_points[num_elem];
+
+          for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
+          CeedCallBackend(
+              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        }
         break;
     }
   }
@@ -210,9 +219,10 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+  CeedCallBackend(
+      CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -433,6 +443,209 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
+//------------------------------------------------------------------------------
+static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
+  Ceed                ceed;
+  bool                is_setup_done;
+  CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda  *impl;
+
+  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+  if (is_setup_done) return CEED_ERROR_SUCCESS;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  {
+    CeedElemRestriction elem_rstr = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
+  }
+  impl->max_num_points = max_num_points;
+
+  // Allocate
+  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  impl->num_inputs  = num_input_fields;
+  impl->num_outputs = num_output_fields;
+
+  // Set up infield and outfield e_vecs and q_vecs
+  // Infields
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem));
+  // Outfields
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
+                                               max_num_points, num_elem));
+
+  CeedCallBackend(CeedOperatorSetSetupDone(op));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Input Basis Action
+//------------------------------------------------------------------------------
+static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const CeedInt *num_points, CeedQFunctionField *qf_input_fields,
+                                                      CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active,
+                                                      CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) {
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedInt             elem_size, size;
+    CeedEvalMode        eval_mode;
+    CeedElemRestriction elem_rstr;
+    CeedBasis           basis;
+
+    // Skip active input
+    if (skip_active) {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      if (vec == CEED_VECTOR_ACTIVE) continue;
+    }
+    // Get elem_size, eval_mode, size
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+    // Basis action
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
+        break;
+      case CEED_EVAL_INTERP:
+      case CEED_EVAL_GRAD:
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
+                                               impl->q_vecs_in[i]));
+        break;
+      case CEED_EVAL_WEIGHT:
+        break;  // No action
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Apply and add to output
+//------------------------------------------------------------------------------
+static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
+  CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda  *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedInt num_points[num_elem];
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  max_num_points = impl->max_num_points;
+  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
+
+  // Get point coordinates
+  if (!impl->point_coords_elem) {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+  }
+
+  // Input basis apply if needed
+  CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      // Set the output Q-Vector to use the E-Vector data directly.
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+    }
+  }
+
+  // Q function
+  CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+  // Output basis apply if needed
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode        eval_mode;
+    CeedElemRestriction elem_rstr;
+    CeedBasis           basis;
+
+    // Get elem_size, eval_mode, size
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
+    // Basis action
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        break;  // No action
+      case CEED_EVAL_INTERP:
+      case CEED_EVAL_GRAD:
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                               impl->e_vecs[i + impl->num_inputs]));
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT: {
+        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        // LCOV_EXCL_STOP
+      }
+    }
+  }
+
+  // Output restriction
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode        eval_mode;
+    CeedVector          vec;
+    CeedElemRestriction elem_rstr;
+
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+    }
+    // Get output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    // Restrict
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    // Active
+    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Linear QFunction Assembly Core
 //------------------------------------------------------------------------------
@@ -1250,6 +1463,20 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Assemble Linear QFunction AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction");
+}
+
+//------------------------------------------------------------------------------
+// Assemble Linear Diagonal AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedSingleOperatorLinearAssembleAddDiagonal");
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
@@ -1273,3 +1500,21 @@ int CeedOperatorCreate_Cuda(CeedOperator op) {
 }
 
 //------------------------------------------------------------------------------
+// Create operator
+//------------------------------------------------------------------------------
+int CeedOperatorCreateAtPoints_Cuda(CeedOperator op) {
+  Ceed               ceed;
+  CeedOperator_Cuda *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedOperatorSetData(op, impl));
+
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c
index 2f00d6c1dc..b8bfcd5f79 100644
--- a/backends/cuda-ref/ceed-cuda-ref.c
+++ b/backends/cuda-ref/ceed-cuda-ref.c
@@ -61,6 +61,7 @@ static int CeedInit_Cuda_ref(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreateAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 312af8ccf5..0c2c5b4972 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -134,7 +134,8 @@ typedef struct {
   CeedVector                *q_vecs_out;    // Output Q-vectors needed to apply operator
   CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
-  CeedVector                *qf_active_in;
+  CeedInt                    max_num_points;
+  CeedVector                *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Cuda     *diag;
   CeedOperatorAssemble_Cuda *asmb;
 } CeedOperator_Cuda;
@@ -160,3 +161,4 @@ CEED_INTERN int CeedQFunctionCreate_Cuda(CeedQFunction qf);
 CEED_INTERN int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx);
 
 CEED_INTERN int CeedOperatorCreate_Cuda(CeedOperator op);
+CEED_INTERN int CeedOperatorCreateAtPoints_Cuda(CeedOperator op);

From 57c38b115ba38ad3c80943a5d59a80b386d4c706 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 20 Jun 2024 16:20:33 -0600
Subject: [PATCH 123/571] ex - use fewer q_extra in dmswarm tests

---
 examples/petsc/dmswarm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/petsc/dmswarm.c b/examples/petsc/dmswarm.c
index 557ace7ec2..1dc3618fd8 100644
--- a/examples/petsc/dmswarm.c
+++ b/examples/petsc/dmswarm.c
@@ -18,7 +18,7 @@
 //
 //  ./dmswarm -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -num_comp 2 -swarm gauss
 //
-//TESTARGS(name="Uniform swarm, CG projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm uniform -solution_order 3 -points_per_cell 125
+//TESTARGS(name="Uniform swarm, CG projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm uniform -solution_order 3 -q_extra 0 -points_per_cell 125
 //TESTARGS(name="Gauss swarm, lumped projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm gauss -ksp_type preonly -pc_type jacobi -pc_jacobi_type rowsum -tolerance 9e-2
 
 /// @file

From 67d9480a68a5a9232b5fd6d362b2ef6ee659593b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 20 Jun 2024 16:30:11 -0600
Subject: [PATCH 124/571] hip - add AtPoints CeedOperator

---
 backends/cuda-ref/ceed-cuda-ref-operator.c |   6 +-
 backends/hip-ref/ceed-hip-ref-operator.c   | 255 ++++++++++++++++++++-
 backends/hip-ref/ceed-hip-ref.c            |   1 +
 backends/hip-ref/ceed-hip-ref.h            |   4 +-
 4 files changed, 257 insertions(+), 9 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 9972a7d8c3..c30736c4aa 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -492,7 +492,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 }
 
 //------------------------------------------------------------------------------
-// Input Basis Action
+// Input Basis Action AtPoints
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const CeedInt *num_points, CeedQFunctionField *qf_input_fields,
                                                       CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active,
@@ -536,7 +536,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const Ce
 }
 
 //------------------------------------------------------------------------------
-// Apply and add to output
+// Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
   CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
@@ -1500,7 +1500,7 @@ int CeedOperatorCreate_Cuda(CeedOperator op) {
 }
 
 //------------------------------------------------------------------------------
-// Create operator
+// Create operator AtPoints
 //------------------------------------------------------------------------------
 int CeedOperatorCreateAtPoints_Cuda(CeedOperator op) {
   Ceed               ceed;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 06ff9bfa98..0417c3ca52 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -41,6 +41,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
   }
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
+  CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
   // QFunction assembly data
   for (CeedInt i = 0; i < impl->num_active_in; i++) {
@@ -94,8 +95,8 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e,
-                                       CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, CeedVector *e_vecs, CeedVector *q_vecs,
+                                       CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -169,7 +170,15 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         q_size = (CeedSize)num_elem * Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        if (is_at_points) {
+          CeedInt num_points[num_elem];
+
+          for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
+          CeedCallBackend(
+              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        }
         break;
     }
   }
@@ -209,9 +218,10 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, false, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+  CeedCallBackend(
+      CeedOperatorSetupFields_Hip(qf, op, false, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -432,6 +442,209 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
+//------------------------------------------------------------------------------
+static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
+  Ceed                ceed;
+  bool                is_setup_done;
+  CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Hip   *impl;
+
+  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+  if (is_setup_done) return CEED_ERROR_SUCCESS;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  {
+    CeedElemRestriction elem_rstr = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
+  }
+  impl->max_num_points = max_num_points;
+
+  // Allocate
+  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  impl->num_inputs  = num_input_fields;
+  impl->num_outputs = num_output_fields;
+
+  // Set up infield and outfield e_vecs and q_vecs
+  // Infields
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem));
+  // Outfields
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
+                                              max_num_points, num_elem));
+
+  CeedCallBackend(CeedOperatorSetSetupDone(op));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Input Basis Action AtPoints
+//------------------------------------------------------------------------------
+static inline int CeedOperatorInputBasisAtPoints_Hip(CeedInt num_elem, const CeedInt *num_points, CeedQFunctionField *qf_input_fields,
+                                                     CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active,
+                                                     CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) {
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedInt             elem_size, size;
+    CeedEvalMode        eval_mode;
+    CeedElemRestriction elem_rstr;
+    CeedBasis           basis;
+
+    // Skip active input
+    if (skip_active) {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      if (vec == CEED_VECTOR_ACTIVE) continue;
+    }
+    // Get elem_size, eval_mode, size
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+    // Basis action
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
+        break;
+      case CEED_EVAL_INTERP:
+      case CEED_EVAL_GRAD:
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
+                                               impl->q_vecs_in[i]));
+        break;
+      case CEED_EVAL_WEIGHT:
+        break;  // No action
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Apply and add to output AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
+  CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Hip   *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedInt num_points[num_elem];
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  max_num_points = impl->max_num_points;
+  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
+
+  // Get point coordinates
+  if (!impl->point_coords_elem) {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+  }
+
+  // Input basis apply if needed
+  CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      // Set the output Q-Vector to use the E-Vector data directly.
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+    }
+  }
+
+  // Q function
+  CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+  // Output basis apply if needed
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode        eval_mode;
+    CeedElemRestriction elem_rstr;
+    CeedBasis           basis;
+
+    // Get elem_size, eval_mode, size
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
+    // Basis action
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        break;  // No action
+      case CEED_EVAL_INTERP:
+      case CEED_EVAL_GRAD:
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                               impl->e_vecs[i + impl->num_inputs]));
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT: {
+        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        // LCOV_EXCL_STOP
+      }
+    }
+  }
+
+  // Output restriction
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode        eval_mode;
+    CeedVector          vec;
+    CeedElemRestriction elem_rstr;
+
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+    }
+    // Get output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    // Restrict
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    // Active
+    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Linear QFunction Assembly Core
 //------------------------------------------------------------------------------
@@ -1247,6 +1460,20 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Assemble Linear QFunction AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction");
+}
+
+//------------------------------------------------------------------------------
+// Assemble Linear Diagonal AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedSingleOperatorLinearAssembleAddDiagonal");
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
@@ -1269,3 +1496,21 @@ int CeedOperatorCreate_Hip(CeedOperator op) {
 }
 
 //------------------------------------------------------------------------------
+// Create operator AtPoints
+//------------------------------------------------------------------------------
+int CeedOperatorCreateAtPoints_Hip(CeedOperator op) {
+  Ceed              ceed;
+  CeedOperator_Hip *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedOperatorSetData(op, impl));
+
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c
index c9f765c70f..fa3215e027 100644
--- a/backends/hip-ref/ceed-hip-ref.c
+++ b/backends/hip-ref/ceed-hip-ref.c
@@ -61,6 +61,7 @@ static int CeedInit_Hip_ref(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreateAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index b73b72d07c..b0df567c55 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -138,7 +138,8 @@ typedef struct {
   CeedVector               *q_vecs_out;    // Output Q-vectors needed to apply operator
   CeedInt                   num_inputs, num_outputs;
   CeedInt                   num_active_in, num_active_out;
-  CeedVector               *qf_active_in;
+  CeedInt                   max_num_points;
+  CeedVector               *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Hip     *diag;
   CeedOperatorAssemble_Hip *asmb;
 } CeedOperator_Hip;
@@ -164,3 +165,4 @@ CEED_INTERN int CeedQFunctionCreate_Hip(CeedQFunction qf);
 CEED_INTERN int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx);
 
 CEED_INTERN int CeedOperatorCreate_Hip(CeedOperator op);
+CEED_INTERN int CeedOperatorCreateAtPoints_Hip(CeedOperator op);

From 349fb27d93e3b8cfa1e9ec4e2d1c9c35fd22455e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Jun 2024 10:37:00 -0600
Subject: [PATCH 125/571] op - CUDA diagonal assembly AtPoints

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 196 ++++++++++++++++++++-
 1 file changed, 195 insertions(+), 1 deletion(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index c30736c4aa..11d9a366df 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1474,7 +1474,201 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedSingleOperatorLinearAssembleAddDiagonal");
+  bool                is_active_at_points = true;
+  CeedSize            e_vec_size          = 0;
+  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields, elem_size_active = 1, num_comp_active = 1;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda  *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedInt num_points[num_elem];
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  max_num_points = impl->max_num_points;
+  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+
+  // Check if active field is at points
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedRestrictionType rstr_type;
+    CeedVector          vec;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    // Skip non-active input
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+
+    // Get active restriction type
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
+  }
+
+  // Get point coordinates
+  if (!impl->point_coords_elem) {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+  }
+
+  // Input basis apply if needed
+  CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      // Set the output Q-Vector to use the E-Vector data directly.
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+    }
+  }
+
+  // Loop over active fields
+  e_vec_size = (is_active_at_points ? max_num_points : elem_size_active) * num_comp_active;
+  for (CeedInt s = 0; s < e_vec_size; s++) {
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool         is_active_input = false;
+      CeedEvalMode eval_mode;
+      CeedVector   vec;
+      CeedBasis    basis;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      // Skip non-active input
+      is_active_input = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active_input) continue;
+
+      // Update unit vector
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
+      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
+      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
+
+      // Basis action
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
+          break;
+        case CEED_EVAL_INTERP:
+        case CEED_EVAL_GRAD:
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
+                                                 impl->q_vecs_in[i]));
+          break;
+        case CEED_EVAL_WEIGHT:
+          break;  // No action
+      }
+    }
+
+    // Q function
+    CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+    // Output basis apply if needed
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      bool                is_active_output = false;
+      CeedEvalMode        eval_mode;
+      CeedVector          vec;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      // Get output vector
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active_output = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active_output) continue;
+
+      // Basis action
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+          break;
+        case CEED_EVAL_INTERP:
+        case CEED_EVAL_GRAD:
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                                 impl->e_vecs[i + impl->num_inputs]));
+          break;
+        // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT: {
+          return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+          // LCOV_EXCL_STOP
+        }
+      }
+
+      // Mask output e-vec
+      {
+        CeedInt  j = num_input_fields;
+        CeedSize out_size;
+
+        CeedCallBackend(CeedVectorGetLength(impl->e_vecs[i + impl->num_inputs], &out_size));
+        for (j = 0; j < num_input_fields; j++) {
+          bool       is_active_input = false;
+          CeedSize   in_size;
+          CeedVector vec;
+
+          // Skip non-active input
+          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec));
+          is_active_input = vec == CEED_VECTOR_ACTIVE;
+          if (!is_active_input) continue;
+          CeedCallBackend(CeedVectorGetLength(impl->e_vecs[j], &in_size));
+          if (in_size == out_size) break;
+        }
+        CeedCheck(j < num_input_fields, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Matching input field not found");
+        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[i + impl->num_inputs], impl->e_vecs[j], impl->e_vecs[i + impl->num_inputs]));
+      }
+
+      // Restrict
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], assembled, request));
+
+      // Reset q_vec for
+      if (eval_mode == CEED_EVAL_NONE) {
+        CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      }
+    }
+  }
+
+  // Restore CEED_EVAL_NONE
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode        eval_mode;
+    CeedElemRestriction elem_rstr;
+
+    // Get eval_mode
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+    }
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------

From afe3bc8a75e92753d9e19eda4e2e3134c127758b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Jun 2024 11:23:46 -0600
Subject: [PATCH 126/571] op - HIP diagonal assembly AtPoints

---
 backends/hip-ref/ceed-hip-ref-operator.c | 196 ++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 1 deletion(-)

diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 0417c3ca52..611ed6834e 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1471,7 +1471,201 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, Ceed
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedSingleOperatorLinearAssembleAddDiagonal");
+  bool                is_active_at_points = true;
+  CeedSize            e_vec_size          = 0;
+  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields, elem_size_active = 1, num_comp_active = 1;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Hip   *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedInt num_points[num_elem];
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  max_num_points = impl->max_num_points;
+  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+
+  // Check if active field is at points
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedRestrictionType rstr_type;
+    CeedVector          vec;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    // Skip non-active input
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+
+    // Get active restriction type
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
+  }
+
+  // Get point coordinates
+  if (!impl->point_coords_elem) {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+  }
+
+  // Input basis apply if needed
+  CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      // Set the output Q-Vector to use the E-Vector data directly.
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+    }
+  }
+
+  // Loop over active fields
+  e_vec_size = (is_active_at_points ? max_num_points : elem_size_active) * num_comp_active;
+  for (CeedInt s = 0; s < e_vec_size; s++) {
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool         is_active_input = false;
+      CeedEvalMode eval_mode;
+      CeedVector   vec;
+      CeedBasis    basis;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      // Skip non-active input
+      is_active_input = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active_input) continue;
+
+      // Update unit vector
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
+      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
+      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
+
+      // Basis action
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
+          break;
+        case CEED_EVAL_INTERP:
+        case CEED_EVAL_GRAD:
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
+                                                 impl->q_vecs_in[i]));
+          break;
+        case CEED_EVAL_WEIGHT:
+          break;  // No action
+      }
+    }
+
+    // Q function
+    CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+    // Output basis apply if needed
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      bool                is_active_output = false;
+      CeedEvalMode        eval_mode;
+      CeedVector          vec;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      // Get output vector
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active_output = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active_output) continue;
+
+      // Basis action
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+          break;
+        case CEED_EVAL_INTERP:
+        case CEED_EVAL_GRAD:
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                                 impl->e_vecs[i + impl->num_inputs]));
+          break;
+        // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT: {
+          return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+          // LCOV_EXCL_STOP
+        }
+      }
+
+      // Mask output e-vec
+      {
+        CeedInt  j = num_input_fields;
+        CeedSize out_size;
+
+        CeedCallBackend(CeedVectorGetLength(impl->e_vecs[i + impl->num_inputs], &out_size));
+        for (j = 0; j < num_input_fields; j++) {
+          bool       is_active_input = false;
+          CeedSize   in_size;
+          CeedVector vec;
+
+          // Skip non-active input
+          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec));
+          is_active_input = vec == CEED_VECTOR_ACTIVE;
+          if (!is_active_input) continue;
+          CeedCallBackend(CeedVectorGetLength(impl->e_vecs[j], &in_size));
+          if (in_size == out_size) break;
+        }
+        CeedCheck(j < num_input_fields, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Matching input field not found");
+        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[i + impl->num_inputs], impl->e_vecs[j], impl->e_vecs[i + impl->num_inputs]));
+      }
+
+      // Restrict
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], assembled, request));
+
+      // Reset q_vec for
+      if (eval_mode == CEED_EVAL_NONE) {
+        CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      }
+    }
+  }
+
+  // Restore CEED_EVAL_NONE
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode        eval_mode;
+    CeedElemRestriction elem_rstr;
+
+    // Get eval_mode
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+    }
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------

From aa72de07106562bbc82cc6511afb18f45fe8145b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 3 Jul 2024 16:33:54 -0600
Subject: [PATCH 127/571] rstr - cast explicitly to avoid size issues

---
 interface/ceed-elemrestriction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 2e78628087..149445ba5a 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -806,7 +806,7 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_s
   CeedCheck(num_elem >= 0, ceed, CEED_ERROR_DIMENSION, "Number of elements must be non-negative");
   CeedCheck(elem_size > 0, ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
-  CeedCheck(l_size >= (CeedSize)num_elem * elem_size * num_comp, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION,
             "L-vector size must be at least num_elem * elem_size * num_comp");
 
   CeedCall(CeedCalloc(1, rstr));
@@ -1136,7 +1136,7 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt
   CeedCheck(elem_size > 0, ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1");
   CeedCheck(block_size > 0, ceed, CEED_ERROR_DIMENSION, "Block size must be at least 1");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
-  CeedCheck(l_size >= (CeedSize)num_elem * elem_size * num_comp, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION,
             "L-vector size must be at least num_elem * elem_size * num_comp");
 
   CeedCall(CeedCalloc(1, rstr));

From 0a5597cece6e3a62854055430a3012108295aea9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 11 Jul 2024 11:44:12 -0600
Subject: [PATCH 128/571] op - cast to CeedSize when creating rstr

---
 backends/blocked/ceed-blocked-operator.c   | 4 ++--
 backends/cuda-ref/ceed-cuda-ref-operator.c | 3 ++-
 backends/hip-ref/ceed-hip-ref-operator.c   | 3 ++-
 backends/opt/ceed-opt-operator.c           | 4 ++--
 backends/ref/ceed-ref-operator.c           | 4 ++--
 interface/ceed-preconditioning.c           | 3 ++-
 6 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 10a72510f3..0df7846d57 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -530,8 +530,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
     const CeedInt  strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
     // Create output restriction
-    CeedCallBackend(
-        CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, qf_size_in * qf_size_out * num_elem * Q, strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                     (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
   }
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 11d9a366df..4f30418503 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -732,7 +732,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
 
     // Create output restriction
     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+                                                     (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                     rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 611ed6834e..8363d6e81d 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -731,7 +731,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
 
     // Create output restriction
     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+                                                     (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                     rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index bc374cd460..db292879ce 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -534,8 +534,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
     CeedInt        strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
     // Create output restriction
-    CeedCallBackend(
-        CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out, qf_size_in * qf_size_out * num_elem * Q, strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                     (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
   }
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 3d65e1af05..e2f68fc38f 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -452,8 +452,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
     CeedInt        strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; /* *NOPAD* */
 
     // Create output restriction
-    CeedCallBackend(
-        CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, qf_size_in * qf_size_out, qf_size_in * qf_size_out * num_elem * Q, strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, qf_size_in * qf_size_out,
+                                                     (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 8eb59409d5..0b499edc77 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -2863,7 +2863,8 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   // -- Restriction
   {
     CeedInt strides[3] = {1, num_nodes, num_nodes * num_comp};
-    CeedCall(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, num_nodes, num_comp, num_elem * num_comp * num_nodes, strides, &rstr_qd_i));
+    CeedCall(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, num_nodes, num_comp,
+                                              (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes, strides, &rstr_qd_i));
   }
 
   // -- QFunction

From 870ea2d915fb99eaf6aa7619430d1689b64a6b73 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 11 Jul 2024 12:32:23 -0600
Subject: [PATCH 129/571] rstr - display size mismatch on error

---
 interface/ceed-elemrestriction.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 149445ba5a..6a5399261b 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -807,7 +807,8 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_s
   CeedCheck(elem_size > 0, ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
   CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION,
-            "L-vector size must be at least num_elem * elem_size * num_comp");
+            "L-vector size must be at least num_elem * elem_size * num_comp. Expected: > %" CeedSize_FMT " Found: %" CeedSize_FMT,
+            (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
   CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
@@ -872,7 +873,9 @@ int CeedElemRestrictionCreateAtPoints(Ceed ceed, CeedInt num_elem, CeedInt num_p
   CeedCheck(num_elem >= 0, ceed, CEED_ERROR_DIMENSION, "Number of elements must be non-negative");
   CeedCheck(num_points >= 0, ceed, CEED_ERROR_DIMENSION, "Number of points must be non-negative");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
-  CeedCheck(l_size >= (CeedSize)num_points * num_comp, ceed, CEED_ERROR_DIMENSION, "L-vector must be at least num_points * num_comp");
+  CeedCheck(l_size >= (CeedSize)num_points * num_comp, ceed, CEED_ERROR_DIMENSION,
+            "L-vector must be at least num_points * num_comp. Expected: > %" CeedSize_FMT " Found: %" CeedSize_FMT, (CeedSize)num_points * num_comp,
+            l_size);
 
   CeedCall(CeedCalloc(1, rstr));
   CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
@@ -1137,7 +1140,8 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt
   CeedCheck(block_size > 0, ceed, CEED_ERROR_DIMENSION, "Block size must be at least 1");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
   CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION,
-            "L-vector size must be at least num_elem * elem_size * num_comp");
+            "L-vector size must be at least num_elem * elem_size * num_comp. Expected: > %" CeedSize_FMT " Found: %" CeedSize_FMT,
+            (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
   CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));

From 3f08121c3514eef2c33780965530b5ca922abfd3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 12 Jul 2024 09:54:43 -0600
Subject: [PATCH 130/571] err - add more data to CeedCheck messages

---
 interface/ceed-basis.c           | 11 +++++++++--
 interface/ceed-elemrestriction.c |  5 ++++-
 interface/ceed-preconditioning.c | 27 ++++++++++++++++++++-------
 interface/ceed-qfunction.c       |  6 ++++--
 interface/ceed-vector.c          | 15 ++++++++++++---
 5 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index d110988371..1f393c3164 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -203,7 +203,10 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   // Check for compatible quadrature spaces
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_to, &Q_to));
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_from, &Q_from));
-  CeedCheck(Q_to == Q_from, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces");
+  CeedCheck(Q_to == Q_from, ceed, CEED_ERROR_DIMENSION,
+            "Bases must have compatible quadrature spaces."
+            " 'basis_from has %" CeedInt_FMT " points and 'basis_to' has %" CeedInt_FMT,
+            Q_from, Q_to);
   Q = Q_to;
 
   // Check for matching tensor or non-tensor
@@ -225,9 +228,13 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
 
   // Check for matching FE space
   CeedFESpace fe_space_to, fe_space_from;
+
   CeedCall(CeedBasisGetFESpace(basis_to, &fe_space_to));
   CeedCall(CeedBasisGetFESpace(basis_from, &fe_space_from));
-  CeedCheck(fe_space_to == fe_space_from, ceed, CEED_ERROR_MINOR, "Bases must both be the same FE space type");
+  CeedCheck(fe_space_to == fe_space_from, ceed, CEED_ERROR_MINOR,
+            "Bases must both be the same FE space type."
+            " 'basis_from' is a %s and 'basis_to' is a %s",
+            CeedFESpaces[fe_space_from], CeedFESpaces[fe_space_to]);
 
   // Get source matrices
   CeedInt           dim, q_comp = 1;
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 2e78628087..9fa36d45d2 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -486,7 +486,10 @@ int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize
   CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_INCOMPATIBLE, "Can only compute offset for a points CeedElemRestriction");
-  CeedCheck(e_size >= rstr->e_size, ceed, CEED_ERROR_INCOMPATIBLE, "Can only increase the size of the E-vector for the CeedElemRestriction");
+  CeedCheck(e_size >= rstr->e_size, ceed, CEED_ERROR_INCOMPATIBLE,
+            "Can only increase the size of the E-vector for the CeedElemRestriction."
+            " Current size: %" CeedSize_FMT " New size: %" CeedSize_FMT,
+            rstr->e_size, e_size);
   rstr->e_size = e_size;
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 8eb59409d5..8d50f2dba0 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -472,7 +472,9 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   if (elem_rstr_in != elem_rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(elem_rstr_out, &num_elem_out));
     CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output operator restrictions must have the same number of elements");
+              "Active input and output operator restrictions must have the same number of elements."
+              " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
+              num_elem_in, num_elem_out);
     CeedCall(CeedElemRestrictionGetElementSize(elem_rstr_out, &elem_size_out));
     CeedCall(CeedElemRestrictionGetNumComponents(elem_rstr_out, &num_comp_out));
     CeedCall(CeedElemRestrictionGetELayout(elem_rstr_out, layout_er_out));
@@ -602,7 +604,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
   CeedCall(CeedOperatorAssemblyDataGetEvalModes(data, &num_active_bases_in, &num_eval_modes_in, &eval_modes_in, NULL, &num_active_bases_out,
                                                 &num_eval_modes_out, &eval_modes_out, NULL, NULL));
 
-  CeedCheck(num_active_bases_in == num_active_bases_out && num_active_bases_in == 1, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(num_active_bases_in == 1 && num_active_bases_out == 1, ceed, CEED_ERROR_UNSUPPORTED,
             "Cannot assemble operator with multiple active bases");
   CeedCheck(num_eval_modes_in[0] > 0 && num_eval_modes_out[0] > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
 
@@ -629,13 +631,17 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
   if (elem_rstr_in != elem_rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(elem_rstr_out, &num_elem_out));
     CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output operator restrictions must have the same number of elements");
+              "Active input and output operator restrictions must have the same number of elements."
+              " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
+              num_elem_in, num_elem_out);
     CeedCall(CeedElemRestrictionGetElementSize(elem_rstr_out, &elem_size_out));
     CeedCall(CeedElemRestrictionGetNumComponents(elem_rstr_out, &num_comp_out));
     if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
     else CeedCall(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
     CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output bases must have the same number of quadrature points");
+              "Active input and output bases must have the same number of quadrature points."
+              " Input has %" CeedInt_FMT " points; output has %" CeedInt_FMT "points.",
+              num_qpts_in, num_qpts_out);
 
     CeedCall(CeedElemRestrictionGetType(elem_rstr_out, &elem_rstr_type_out));
     if (elem_rstr_type_out == CEED_RESTRICTION_ORIENTED) {
@@ -802,7 +808,9 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num
   if (rstr_in != rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out));
     CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output operator restrictions must have the same number of elements");
+              "Active input and output operator restrictions must have the same number of elements."
+              " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
+              num_elem_in, num_elem_out);
     CeedCall(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
     CeedCall(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
   } else {
@@ -2123,7 +2131,9 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
                   "Active element restrictions must have the same component stride: %d vs %d", comp_stride, comp_stride_sub);
         CeedCall(CeedElemRestrictionGetNumComponents(active_elem_rstrs[i], &num_active_components_sub));
         CeedCheck(num_active_components == num_active_components_sub, ceed, CEED_ERROR_INCOMPATIBLE,
-                  "All suboperators must have the same number of output components");
+                  "All suboperators must have the same number of output components."
+                  " Previous: %" CeedInt_FMT " Current: %" CeedInt_FMT,
+                  num_active_components, num_active_components_sub);
       }
     }
   }
@@ -2571,7 +2581,10 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
   CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine));
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_fine, &Q_f));
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_coarse, &Q_c));
-  CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces");
+  CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION,
+            "Bases must have compatible quadrature spaces."
+            " Fine grid: %" CeedInt_FMT " points, Coarse grid: %" CeedInt_FMT " points",
+            Q_f, Q_c);
 
   // Create coarse to fine basis, if required
   if (op_prolong || op_restrict) {
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index ad90099412..02715249d4 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -775,10 +775,12 @@ int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size
   CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable");
   CeedCheck(eval_mode != CEED_EVAL_WEIGHT || size == 1, ceed, CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique. Duplicate name: %s", field_name);
   }
   for (CeedInt i = 0; i < qf->num_output_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique. Duplicate name: %s", field_name);
   }
   CeedCall(CeedQFunctionFieldSet(&qf->input_fields[qf->num_input_fields], field_name, size, eval_mode));
   qf->num_input_fields++;
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index e2da964009..7c10ca98bc 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -735,7 +735,10 @@ int CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x) {
   CeedCall(CeedVectorGetCeed(y, &ceed));
   CeedCall(CeedVectorGetLength(y, &length_y));
   CeedCall(CeedVectorGetLength(x, &length_x));
-  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add vector of different lengths");
+  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED,
+            "Cannot add vector of different lengths."
+            " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
+            length_x, length_y);
   CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY");
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
@@ -795,7 +798,10 @@ int CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector
 
   CeedCall(CeedVectorGetLength(y, &length_y));
   CeedCall(CeedVectorGetLength(x, &length_x));
-  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add vector of different lengths");
+  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED,
+            "Cannot add vector of different lengths."
+            " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
+            length_x, length_y);
   CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPBY");
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
@@ -856,7 +862,10 @@ int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y) {
   CeedCall(CeedVectorGetLength(w, &length_w));
   CeedCall(CeedVectorGetLength(x, &length_x));
   CeedCall(CeedVectorGetLength(y, &length_y));
-  CeedCheck(length_w == length_x && length_w == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot multiply vectors of different lengths");
+  CeedCheck(length_w == length_x && length_w == length_y, ceed, CEED_ERROR_UNSUPPORTED,
+            "Cannot multiply vectors of different lengths."
+            " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
+            length_x, length_y);
 
   CeedCall(CeedGetParent(w->ceed, &ceed_parent_w));
   CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));

From 236227552b176e9369bb36c1a06460b4aa8f7627 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 12 Jul 2024 14:52:51 -0600
Subject: [PATCH 131/571] minor - fix typo

---
 interface/ceed-basis.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 1f393c3164..bc16186459 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -205,7 +205,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_from, &Q_from));
   CeedCheck(Q_to == Q_from, ceed, CEED_ERROR_DIMENSION,
             "Bases must have compatible quadrature spaces."
-            " 'basis_from has %" CeedInt_FMT " points and 'basis_to' has %" CeedInt_FMT,
+            " 'basis_from' has %" CeedInt_FMT " points and 'basis_to' has %" CeedInt_FMT,
             Q_from, Q_to);
   Q = Q_to;
 

From 8bf1b130e1dc187f0ab76c5035337405b3b03737 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 15 Jul 2024 10:12:17 -0600
Subject: [PATCH 132/571] doc(op): Remove CeedOperatorView_Core from user docs

---
 interface/ceed-operator.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 4acb8f1983..5881846665 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1476,13 +1476,12 @@ int CeedOperatorSetName(CeedOperator op, const char *name) {
   @brief Core logic for viewing a `CeedOperator`
 
   @param[in] op     `CeedOperator` to view brief summary
-  @param[in] stream Stream to write; typically `stdout` or a file
+  @param[in] stream  Stream to write; typically `stdout` or a file
+  @param[in] is_full Whether to write full operator view or terse
 
   @return Error code: 0 - success, otherwise - failure
-
-  @ref User
 **/
-int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
+static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
   bool has_name = op->name, is_composite;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));

From 9b443e3bfd4287070359f52cbb140b235b9f2b08 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 16 Jul 2024 14:38:38 -0600
Subject: [PATCH 133/571] gpu - minimum input/output array size of 1

---
 backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp      | 4 ++--
 backends/hip-ref/ceed-hip-ref-qfunction-load.cpp        | 4 ++--
 backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
index 03ace250fb..ed40b1fca9 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
@@ -69,7 +69,7 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
     code << "  const CeedInt size_input_" << i << " = " << size << ";\n";
     code << "  CeedScalar input_" << i << "[size_input_" << i << "];\n";
   }
-  code << "  const CeedScalar* inputs[" << num_input_fields << "];\n";
+  code << "  const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     code << "  inputs[" << i << "] = input_" << i << ";\n";
   }
@@ -82,7 +82,7 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
     code << "  const CeedInt size_output_" << i << " = " << size << ";\n";
     code << "  CeedScalar output_" << i << "[size_output_" << i << "];\n";
   }
-  code << "  CeedScalar* outputs[" << num_output_fields << "];\n";
+  code << "  CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "  outputs[" << i << "] = output_" << i << ";\n";
   }
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
index 222a94fc85..3ba4f23266 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
@@ -69,7 +69,7 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
     code << "  const CeedInt size_input_" << i << " = " << size << ";\n";
     code << "  CeedScalar input_" << i << "[size_input_" << i << "];\n";
   }
-  code << "  const CeedScalar* inputs[" << num_input_fields << "];\n";
+  code << "  const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     code << "  inputs[" << i << "] = input_" << i << ";\n";
   }
@@ -82,7 +82,7 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
     code << "  const CeedInt size_output_" << i << " = " << size << ";\n";
     code << "  CeedScalar output_" << i << "[size_output_" << i << "];\n";
   }
-  code << "  CeedScalar* outputs[" << num_output_fields << "];\n";
+  code << "  CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "  outputs[" << i << "] = output_" << i << ";\n";
   }
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
index 759b9b9a5a..1d3cf330ad 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
@@ -118,7 +118,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   for (CeedInt i = 0; i < num_input_fields; ++i) {
     code << "  CeedScalar U_" << i << "[" << input_sizes[i] << "];\n";
   }
-  code << "  const CeedScalar *inputs[" << num_input_fields << "] = {U_0";
+  code << "  const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "] = {U_0";
   for (CeedInt i = 1; i < num_input_fields; i++) {
     code << ", U_" << i << "\n";
   }
@@ -129,7 +129,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "  CeedScalar V_" << i << "[" << output_sizes[i] << "];\n";
   }
-  code << "  CeedScalar *outputs[" << num_output_fields << "] = {V_0";
+  code << "  CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "] = {V_0";
   for (CeedInt i = 1; i < num_output_fields; i++) {
     code << ", V_" << i << "\n";
   }

From 382e9c833ac782f49cfe92c0a165d4781243d726 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 2 Aug 2024 11:32:03 -0600
Subject: [PATCH 134/571] atPoints - fix diagonal assembly for mixed

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 207 +++++++++++----------
 backends/hip-ref/ceed-hip-ref-operator.c   | 207 +++++++++++----------
 backends/ref/ceed-ref-operator.c           | 201 +++++++++++---------
 3 files changed, 325 insertions(+), 290 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 4f30418503..ad8868bc74 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1475,9 +1475,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  bool                is_active_at_points = true;
-  CeedSize            e_vec_size          = 0;
-  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields, elem_size_active = 1, num_comp_active = 1;
+  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -1499,24 +1497,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
 
-  // Check if active field is at points
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedRestrictionType rstr_type;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    // Skip non-active input
-    if (vec != CEED_VECTOR_ACTIVE) continue;
-
-    // Get active restriction type
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
-    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
-    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
-  }
-
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -1527,6 +1507,16 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
+  // Clear active input Qvecs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+    CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
+    CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+  }
+
   // Input basis apply if needed
   CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
 
@@ -1543,9 +1533,27 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   }
 
   // Loop over active fields
-  e_vec_size = (is_active_at_points ? max_num_points : elem_size_active) * num_comp_active;
-  for (CeedInt s = 0; s < e_vec_size; s++) {
-    for (CeedInt i = 0; i < num_input_fields; i++) {
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool                is_active_at_points = true;
+    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
+    CeedRestrictionType rstr_type;
+    CeedVector          vec;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    // -- Skip non-active input
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+
+    // -- Get active restriction type
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    else elem_size = max_num_points;
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+
+    e_vec_size = elem_size * num_comp_active;
+    for (CeedInt s = 0; s < e_vec_size; s++) {
       bool         is_active_input = false;
       CeedEvalMode eval_mode;
       CeedVector   vec;
@@ -1557,7 +1565,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       if (!is_active_input) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
       else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
       CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
 
@@ -1578,92 +1585,94 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         case CEED_EVAL_WEIGHT:
           break;  // No action
       }
-    }
-
-    // Q function
-    CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
-
-    // Output basis apply if needed
-    for (CeedInt i = 0; i < num_output_fields; i++) {
-      bool                is_active_output = false;
-      CeedEvalMode        eval_mode;
-      CeedVector          vec;
-      CeedElemRestriction elem_rstr;
-      CeedBasis           basis;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      is_active_output = vec == CEED_VECTOR_ACTIVE;
-      if (!is_active_output) continue;
+      // Q function
+      CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+      // Output basis apply if needed
+      for (CeedInt j = 0; j < num_output_fields; j++) {
+        bool                is_active_output = false;
+        CeedInt             elem_size        = 0;
+        CeedRestrictionType rstr_type;
+        CeedEvalMode        eval_mode;
+        CeedVector          vec;
+        CeedElemRestriction elem_rstr;
+        CeedBasis           basis;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+        // ---- Skip non-active output
+        is_active_output = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active_output) continue;
+
+        // ---- Check if elem size matches
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+        if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
+        if (rstr_type == CEED_RESTRICTION_POINTS) {
+          CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size));
+        } else {
+          CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+        }
+        {
+          CeedInt num_comp = 0;
 
-      // Basis action
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-      switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
-          break;
-        case CEED_EVAL_INTERP:
-        case CEED_EVAL_GRAD:
-        case CEED_EVAL_DIV:
-        case CEED_EVAL_CURL:
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                                 impl->e_vecs[i + impl->num_inputs]));
-          break;
-        // LCOV_EXCL_START
-        case CEED_EVAL_WEIGHT: {
-          return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-          // LCOV_EXCL_STOP
+          CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+          if (e_vec_size != num_comp * elem_size) continue;
         }
-      }
 
-      // Mask output e-vec
-      {
-        CeedInt  j = num_input_fields;
-        CeedSize out_size;
-
-        CeedCallBackend(CeedVectorGetLength(impl->e_vecs[i + impl->num_inputs], &out_size));
-        for (j = 0; j < num_input_fields; j++) {
-          bool       is_active_input = false;
-          CeedSize   in_size;
-          CeedVector vec;
-
-          // Skip non-active input
-          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec));
-          is_active_input = vec == CEED_VECTOR_ACTIVE;
-          if (!is_active_input) continue;
-          CeedCallBackend(CeedVectorGetLength(impl->e_vecs[j], &in_size));
-          if (in_size == out_size) break;
+        // Basis action
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        switch (eval_mode) {
+          case CEED_EVAL_NONE:
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[j + impl->num_inputs], &e_data[j + num_input_fields]));
+            break;
+          case CEED_EVAL_INTERP:
+          case CEED_EVAL_GRAD:
+          case CEED_EVAL_DIV:
+          case CEED_EVAL_CURL:
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
+                                                   impl->q_vecs_out[j], impl->e_vecs[j + impl->num_inputs]));
+            break;
+          // LCOV_EXCL_START
+          case CEED_EVAL_WEIGHT: {
+            return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+            // LCOV_EXCL_STOP
+          }
         }
-        CeedCheck(j < num_input_fields, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Matching input field not found");
-        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[i + impl->num_inputs], impl->e_vecs[j], impl->e_vecs[i + impl->num_inputs]));
-      }
 
-      // Restrict
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], assembled, request));
+        // Mask output e-vec
+        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[j + impl->num_inputs], impl->e_vecs[i], impl->e_vecs[j + impl->num_inputs]));
 
-      // Reset q_vec for
-      if (eval_mode == CEED_EVAL_NONE) {
-        CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+        // Restrict
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[j + impl->num_inputs], assembled, request));
+
+        // Reset q_vec for
+        if (eval_mode == CEED_EVAL_NONE) {
+          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[j + impl->num_inputs], CEED_MEM_DEVICE, &e_data[j + num_input_fields]));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[j + num_input_fields]));
+        }
       }
+
+      // Reset vec
+      if (s == e_vec_size - 1) CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
     }
-  }
 
-  // Restore CEED_EVAL_NONE
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
+    // Restore CEED_EVAL_NONE
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
 
-    // Get eval_mode
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Get eval_mode
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
 
-    // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      // Restore evec
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_NONE) {
+        CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      }
     }
   }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 8363d6e81d..7134b9593e 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1472,9 +1472,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, Ceed
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  bool                is_active_at_points = true;
-  CeedSize            e_vec_size          = 0;
-  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields, elem_size_active = 1, num_comp_active = 1;
+  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -1496,24 +1494,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
 
-  // Check if active field is at points
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedRestrictionType rstr_type;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    // Skip non-active input
-    if (vec != CEED_VECTOR_ACTIVE) continue;
-
-    // Get active restriction type
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
-    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
-    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
-  }
-
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -1524,6 +1504,16 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
+  // Clear active input Qvecs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+    CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
+    CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+  }
+
   // Input basis apply if needed
   CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
 
@@ -1540,9 +1530,27 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   }
 
   // Loop over active fields
-  e_vec_size = (is_active_at_points ? max_num_points : elem_size_active) * num_comp_active;
-  for (CeedInt s = 0; s < e_vec_size; s++) {
-    for (CeedInt i = 0; i < num_input_fields; i++) {
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool                is_active_at_points = true;
+    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
+    CeedRestrictionType rstr_type;
+    CeedVector          vec;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    // -- Skip non-active input
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+
+    // -- Get active restriction type
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    else elem_size = max_num_points;
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+
+    e_vec_size = elem_size * num_comp_active;
+    for (CeedInt s = 0; s < e_vec_size; s++) {
       bool         is_active_input = false;
       CeedEvalMode eval_mode;
       CeedVector   vec;
@@ -1554,7 +1562,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       if (!is_active_input) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
       else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
       CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
 
@@ -1575,92 +1582,94 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         case CEED_EVAL_WEIGHT:
           break;  // No action
       }
-    }
-
-    // Q function
-    CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
-
-    // Output basis apply if needed
-    for (CeedInt i = 0; i < num_output_fields; i++) {
-      bool                is_active_output = false;
-      CeedEvalMode        eval_mode;
-      CeedVector          vec;
-      CeedElemRestriction elem_rstr;
-      CeedBasis           basis;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      is_active_output = vec == CEED_VECTOR_ACTIVE;
-      if (!is_active_output) continue;
+      // Q function
+      CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+      // Output basis apply if needed
+      for (CeedInt j = 0; j < num_output_fields; j++) {
+        bool                is_active_output = false;
+        CeedInt             elem_size        = 0;
+        CeedRestrictionType rstr_type;
+        CeedEvalMode        eval_mode;
+        CeedVector          vec;
+        CeedElemRestriction elem_rstr;
+        CeedBasis           basis;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+        // ---- Skip non-active output
+        is_active_output = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active_output) continue;
+
+        // ---- Check if elem size matches
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+        if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
+        if (rstr_type == CEED_RESTRICTION_POINTS) {
+          CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size));
+        } else {
+          CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+        }
+        {
+          CeedInt num_comp = 0;
 
-      // Basis action
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-      switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
-          break;
-        case CEED_EVAL_INTERP:
-        case CEED_EVAL_GRAD:
-        case CEED_EVAL_DIV:
-        case CEED_EVAL_CURL:
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                                 impl->e_vecs[i + impl->num_inputs]));
-          break;
-        // LCOV_EXCL_START
-        case CEED_EVAL_WEIGHT: {
-          return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-          // LCOV_EXCL_STOP
+          CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+          if (e_vec_size != num_comp * elem_size) continue;
         }
-      }
 
-      // Mask output e-vec
-      {
-        CeedInt  j = num_input_fields;
-        CeedSize out_size;
-
-        CeedCallBackend(CeedVectorGetLength(impl->e_vecs[i + impl->num_inputs], &out_size));
-        for (j = 0; j < num_input_fields; j++) {
-          bool       is_active_input = false;
-          CeedSize   in_size;
-          CeedVector vec;
-
-          // Skip non-active input
-          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec));
-          is_active_input = vec == CEED_VECTOR_ACTIVE;
-          if (!is_active_input) continue;
-          CeedCallBackend(CeedVectorGetLength(impl->e_vecs[j], &in_size));
-          if (in_size == out_size) break;
+        // Basis action
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        switch (eval_mode) {
+          case CEED_EVAL_NONE:
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[j + impl->num_inputs], &e_data[j + num_input_fields]));
+            break;
+          case CEED_EVAL_INTERP:
+          case CEED_EVAL_GRAD:
+          case CEED_EVAL_DIV:
+          case CEED_EVAL_CURL:
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
+                                                   impl->q_vecs_out[j], impl->e_vecs[j + impl->num_inputs]));
+            break;
+          // LCOV_EXCL_START
+          case CEED_EVAL_WEIGHT: {
+            return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+            // LCOV_EXCL_STOP
+          }
         }
-        CeedCheck(j < num_input_fields, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Matching input field not found");
-        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[i + impl->num_inputs], impl->e_vecs[j], impl->e_vecs[i + impl->num_inputs]));
-      }
 
-      // Restrict
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], assembled, request));
+        // Mask output e-vec
+        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[j + impl->num_inputs], impl->e_vecs[i], impl->e_vecs[j + impl->num_inputs]));
 
-      // Reset q_vec for
-      if (eval_mode == CEED_EVAL_NONE) {
-        CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+        // Restrict
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[j + impl->num_inputs], assembled, request));
+
+        // Reset q_vec for
+        if (eval_mode == CEED_EVAL_NONE) {
+          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[j + impl->num_inputs], CEED_MEM_DEVICE, &e_data[j + num_input_fields]));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[j + num_input_fields]));
+        }
       }
+
+      // Reset vec
+      if (s == e_vec_size - 1) CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
     }
-  }
 
-  // Restore CEED_EVAL_NONE
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
+    // Restore CEED_EVAL_NONE
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
 
-    // Get eval_mode
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Get eval_mode
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
 
-    // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      // Restore evec
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_NONE) {
+        CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      }
     }
   }
 
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index e2f68fc38f..844dcc2989 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1120,8 +1120,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref(CeedOperator op
 // Assemble Operator Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  bool                is_active_at_points = true;
-  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, elem_size_active = 1, num_comp_active = 1;
+  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, num_comp_active = 1;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {0};
   Ceed                ceed;
   CeedVector          point_coords = NULL, in_vec, out_vec;
@@ -1162,27 +1161,12 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
     CeedCallBackend(CeedVectorSetValue(out_vec, 0.0));
   }
 
+  // Clear input Qvecs
+  for (CeedInt i = 0; i < num_input_fields; i++) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
 
-  // Check if active field is at points
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedRestrictionType rstr_type;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    // Skip non-active input
-    if (vec != CEED_VECTOR_ACTIVE) continue;
-
-    // Get active restriction type
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
-    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
-    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
-  }
-
   // Loop through elements
   for (CeedInt e = 0; e < num_elem; e++) {
     CeedInt num_points, e_vec_size = 0;
@@ -1196,30 +1180,40 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
                                                        impl->point_coords_elem, true, e_data, impl, request));
 
     // Loop over points on element
-    e_vec_size = (is_active_at_points ? num_points : elem_size_active) * num_comp_active;
-    for (CeedInt s = 0; s < e_vec_size; s++) {
-      for (CeedInt i = 0; i < num_input_fields; i++) {
-        bool         is_active_input = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_active_at_points = true;
+      CeedInt             elem_size_active    = 1;
+      CeedRestrictionType rstr_type;
+      CeedVector          vec;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      // -- Skip non-active input
+      if (vec != CEED_VECTOR_ACTIVE) continue;
+
+      // -- Get active restriction type
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+      is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+      if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
+      else elem_size_active = num_points;
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+
+      e_vec_size = elem_size_active * num_comp_active;
+      for (CeedInt s = 0; s < e_vec_size; s++) {
         CeedEvalMode eval_mode;
-        CeedVector   vec;
         CeedBasis    basis;
 
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-        // Skip non-active input
-        is_active_input = vec == CEED_VECTOR_ACTIVE;
-        if (!is_active_input) continue;
-
-        // Update unit vector
+        // -- Update unit vector
         {
           CeedScalar *array;
 
-          if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
           CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
           array[s] = 1.0;
           if (s > 0) array[s - 1] = 0.0;
           CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
         }
-        // Basis action
+        // -- Basis action
         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
         switch (eval_mode) {
           case CEED_EVAL_NONE:
@@ -1236,69 +1230,92 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           case CEED_EVAL_WEIGHT:
             break;  // No action
         }
-      }
 
-      // -- Q function
-      if (!impl->is_identity_qf) {
-        CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
-      }
+        // -- Q function
+        if (!impl->is_identity_qf) {
+          CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
+        }
 
-      // -- Output basis apply and restriction
-      CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
-                                                          num_output_fields, op, out_vec, impl->point_coords_elem, impl, request));
-
-      // -- Grab diagonal value
-      for (CeedInt i = 0; i < num_output_fields; i++) {
-        bool                is_active_output = false;
-        CeedRestrictionType rstr_type;
-        CeedEvalMode        eval_mode;
-        CeedVector          vec;
-        CeedElemRestriction elem_rstr;
-        CeedBasis           basis;
-
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-        // ---- Skip non-active output
-        is_active_output = vec == CEED_VECTOR_ACTIVE;
-        if (!is_active_output) continue;
-
-        // ---- Basis action
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-        switch (eval_mode) {
-          case CEED_EVAL_NONE:
-            break;  // No action
-          case CEED_EVAL_INTERP:
-          case CEED_EVAL_GRAD:
-          case CEED_EVAL_DIV:
-          case CEED_EVAL_CURL:
-            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                                   impl->e_vecs_out[i]));
-            break;
-          // LCOV_EXCL_START
-          case CEED_EVAL_WEIGHT: {
-            return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-            // LCOV_EXCL_STOP
+        // -- Output basis apply and restriction
+        CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
+                                                            num_output_fields, op, out_vec, impl->point_coords_elem, impl, request));
+
+        // -- Grab diagonal value
+        for (CeedInt i = 0; i < num_output_fields; i++) {
+          bool                is_active_output = false;
+          CeedInt             elem_size        = 0;
+          CeedRestrictionType rstr_type;
+          CeedEvalMode        eval_mode;
+          CeedVector          vec;
+          CeedElemRestriction elem_rstr;
+          CeedBasis           basis;
+
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+          // ---- Skip non-active output
+          is_active_output = vec == CEED_VECTOR_ACTIVE;
+          if (!is_active_output) continue;
+
+          // ---- Check if elem size matches
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+          if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
+          if (rstr_type == CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, e, &elem_size));
+          } else {
+            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+          }
+          {
+            CeedInt num_comp = 0;
+
+            CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+            if (e_vec_size != num_comp * elem_size) continue;
+          }
+
+          // ---- Basis action
+          CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+          switch (eval_mode) {
+            case CEED_EVAL_NONE:
+              break;  // No action
+            case CEED_EVAL_INTERP:
+            case CEED_EVAL_GRAD:
+            case CEED_EVAL_DIV:
+            case CEED_EVAL_CURL:
+              CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+              CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                                     impl->e_vecs_out[i]));
+              break;
+            // LCOV_EXCL_START
+            case CEED_EVAL_WEIGHT: {
+              return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+              // LCOV_EXCL_STOP
+            }
+          }
+          // ---- Update output vector
+          {
+            CeedScalar *array, current_value = 0.0;
+
+            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
+            current_value = array[s];
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
+            CeedCallBackend(CeedVectorSetValue(impl->e_vecs_out[i], 0.0));
+            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
+            array[s] = current_value;
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
+          }
+          // ---- Restrict output block
+          if (rstr_type == CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
+          } else {
+            CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
           }
         }
-        // ---- Update output vector
-        {
-          CeedScalar *array, current_value = 0.0;
-
-          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
-          current_value = array[s];
-          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
-          CeedCallBackend(CeedVectorSetValue(impl->e_vecs_out[i], 0.0));
-          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
-          array[s] = current_value;
-          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
-        }
-        // ---- Restrict output block
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
-        if (rstr_type == CEED_RESTRICTION_POINTS) {
-          CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
-        } else {
-          CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
+        // -- Reset unit vector
+        if (s == e_vec_size - 1) {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
         }
       }
     }

From 86e107299058a5a08bd170fc9397de9f32c01778 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 2 Aug 2024 12:48:03 -0600
Subject: [PATCH 135/571] atpoints - fix diagonal bug with stale qvec data

---
 backends/cuda-ref/ceed-cuda-ref-operator.c |  5 ++-
 backends/hip-ref/ceed-hip-ref-operator.c   |  5 ++-
 backends/ref/ceed-ref-operator.c           | 38 +++++++++++++---------
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index ad8868bc74..295a8cb579 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1656,7 +1656,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       }
 
       // Reset vec
-      if (s == e_vec_size - 1) CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
+      if (s == e_vec_size - 1) {
+        CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
+        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+      }
     }
 
     // Restore CEED_EVAL_NONE
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 7134b9593e..4d0314fdbb 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1653,7 +1653,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       }
 
       // Reset vec
-      if (s == e_vec_size - 1) CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
+      if (s == e_vec_size - 1) {
+        CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
+        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+      }
     }
 
     // Restore CEED_EVAL_NONE
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 844dcc2989..ae4d715cee 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1162,7 +1162,14 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
   }
 
   // Clear input Qvecs
-  for (CeedInt i = 0; i < num_input_fields; i++) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    if (vec != CEED_VECTOR_ACTIVE) continue;
+    CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+    CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+  }
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
@@ -1241,7 +1248,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
                                                             num_output_fields, op, out_vec, impl->point_coords_elem, impl, request));
 
         // -- Grab diagonal value
-        for (CeedInt i = 0; i < num_output_fields; i++) {
+        for (CeedInt j = 0; j < num_output_fields; j++) {
           bool                is_active_output = false;
           CeedInt             elem_size        = 0;
           CeedRestrictionType rstr_type;
@@ -1250,13 +1257,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           CeedElemRestriction elem_rstr;
           CeedBasis           basis;
 
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
           // ---- Skip non-active output
           is_active_output = vec == CEED_VECTOR_ACTIVE;
           if (!is_active_output) continue;
 
           // ---- Check if elem size matches
-          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
           if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
           if (rstr_type == CEED_RESTRICTION_POINTS) {
@@ -1272,7 +1279,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           }
 
           // ---- Basis action
-          CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+          CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
           switch (eval_mode) {
             case CEED_EVAL_NONE:
               break;  // No action
@@ -1280,9 +1287,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
             case CEED_EVAL_GRAD:
             case CEED_EVAL_DIV:
             case CEED_EVAL_CURL:
-              CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-              CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                                     impl->e_vecs_out[i]));
+              CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
+              CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[j],
+                                                     impl->e_vecs_out[j]));
               break;
             // LCOV_EXCL_START
             case CEED_EVAL_WEIGHT: {
@@ -1294,19 +1301,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           {
             CeedScalar *array, current_value = 0.0;
 
-            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
+            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[j], CEED_MEM_HOST, &array));
             current_value = array[s];
-            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
-            CeedCallBackend(CeedVectorSetValue(impl->e_vecs_out[i], 0.0));
-            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[i], CEED_MEM_HOST, &array));
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &array));
+            CeedCallBackend(CeedVectorSetValue(impl->e_vecs_out[j], 0.0));
+            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[j], CEED_MEM_HOST, &array));
             array[s] = current_value;
-            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &array));
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &array));
           }
           // ---- Restrict output block
           if (rstr_type == CEED_RESTRICTION_POINTS) {
-            CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
+            CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
           } else {
-            CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], assembled, request));
+            CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
           }
         }
         // -- Reset unit vector
@@ -1316,6 +1323,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
           array[s] = 0.0;
           CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+          CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         }
       }
     }

From 13062808ae526b03dadc34de20ded43898c5341c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 2 Aug 2024 14:09:22 -0600
Subject: [PATCH 136/571] atpoints - remove some extra operations

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 29 +++++++++-------------
 backends/hip-ref/ceed-hip-ref-operator.c   | 29 +++++++++-------------
 backends/ref/ceed-ref-operator.c           | 11 ++------
 3 files changed, 26 insertions(+), 43 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 295a8cb579..98d4dfe406 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1513,7 +1513,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     if (vec != CEED_VECTOR_ACTIVE) continue;
-    CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
@@ -1565,6 +1564,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       if (!is_active_input) continue;
 
       // Update unit vector
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
       else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
       CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
 
@@ -1656,26 +1656,21 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       }
 
       // Reset vec
-      if (s == e_vec_size - 1) {
-        CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
-        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-      }
+      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
     }
+  }
 
-    // Restore CEED_EVAL_NONE
-    for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedEvalMode        eval_mode;
-      CeedElemRestriction elem_rstr;
+  // Restore CEED_EVAL_NONE
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
 
-      // Get eval_mode
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    // Get eval_mode
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
 
-      // Restore evec
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-      if (eval_mode == CEED_EVAL_NONE) {
-        CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
-      }
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
     }
   }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 4d0314fdbb..34fb349f0d 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1510,7 +1510,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     if (vec != CEED_VECTOR_ACTIVE) continue;
-    CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
@@ -1562,6 +1561,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       if (!is_active_input) continue;
 
       // Update unit vector
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
       else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
       CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
 
@@ -1653,26 +1653,21 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       }
 
       // Reset vec
-      if (s == e_vec_size - 1) {
-        CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 0.0));
-        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-      }
+      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
     }
+  }
 
-    // Restore CEED_EVAL_NONE
-    for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedEvalMode        eval_mode;
-      CeedElemRestriction elem_rstr;
+  // Restore CEED_EVAL_NONE
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
 
-      // Get eval_mode
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    // Get eval_mode
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
 
-      // Restore evec
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-      if (eval_mode == CEED_EVAL_NONE) {
-        CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
-      }
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
     }
   }
 
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index ae4d715cee..0c472d1d3c 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1167,7 +1167,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     if (vec != CEED_VECTOR_ACTIVE) continue;
-    CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
@@ -1215,6 +1214,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
         {
           CeedScalar *array;
 
+          if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
           CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
           array[s] = 1.0;
           if (s > 0) array[s - 1] = 0.0;
@@ -1317,14 +1317,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           }
         }
         // -- Reset unit vector
-        if (s == e_vec_size - 1) {
-          CeedScalar *array;
-
-          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
-          array[s] = 0.0;
-          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
-          CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        }
+        if (s == e_vec_size - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
       }
     }
     num_points_offset += num_points;

From 3aab95c02208308af393c9014d0e46d9d208d4a6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 5 Aug 2024 09:56:54 -0600
Subject: [PATCH 137/571] op - minor performance improvement for op with repeat
 input rstr

---
 backends/blocked/ceed-blocked-operator.c   | 38 +++++++++---
 backends/blocked/ceed-blocked.h            |  5 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c | 42 ++++++++++---
 backends/cuda-ref/ceed-cuda-ref.h          |  3 +-
 backends/hip-ref/ceed-hip-ref-operator.c   | 42 ++++++++++---
 backends/hip-ref/ceed-hip-ref.h            |  3 +-
 backends/opt/ceed-opt-operator.c           | 33 ++++++++--
 backends/ref/ceed-ref-operator.c           | 70 ++++++++++++++++++----
 backends/ref/ceed-ref.h                    |  3 +-
 9 files changed, 195 insertions(+), 44 deletions(-)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 0df7846d57..788533cbff 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -16,7 +16,7 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size,
+static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, const CeedInt block_size,
                                            CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs,
                                            CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
@@ -135,6 +135,28 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
         break;
     }
   }
+  // Drop duplicate input restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i], &e_vecs_full[j]));
+          skip_rstr[j] = true;
+        }
+      }
+    }
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -166,6 +188,7 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr));
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -177,11 +200,11 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
-                                                  num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, impl->skip_rstr_in, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in,
+                                                  impl->q_vecs_in, 0, num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out,
-                                                  num_input_fields, num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, NULL, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out,
+                                                  impl->q_vecs_out, num_input_fields, num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -226,10 +249,10 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed
     } else {
       // Restrict
       CeedCallBackend(CeedVectorGetState(vec, &state));
-      if (state != impl->input_states[i] || vec == in_vec) {
+      if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
         CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
-        impl->input_states[i] = state;
       }
+      impl->input_states[i] = state;
       // Get evec
       CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
     }
@@ -647,6 +670,7 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) {
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedElemRestrictionDestroy(&impl->block_rstr[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h
index fef7967518..5876e969b7 100644
--- a/backends/blocked/ceed-blocked.h
+++ b/backends/blocked/ceed-blocked.h
@@ -17,13 +17,14 @@ typedef struct {
 
 typedef struct {
   bool                 is_identity_qf, is_identity_rstr_op;
-  CeedElemRestriction *block_rstr;   /* Blocked versions of restrictions */
-  CeedVector          *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
+  bool                *skip_rstr_in;
   uint64_t            *input_states; /* State counter of inputs */
+  CeedVector          *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   CeedVector          *e_vecs_in;    /* Element block input E-vectors  */
   CeedVector          *e_vecs_out;   /* Element block output E-vectors */
   CeedVector          *q_vecs_in;    /* Element block input Q-vectors  */
   CeedVector          *q_vecs_out;   /* Element block output Q-vectors */
+  CeedElemRestriction *block_rstr;   /* Blocked versions of restrictions */
   CeedInt              num_inputs, num_outputs;
   CeedInt              qf_size_in, qf_size_out;
   CeedVector           qf_l_vec;
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 98d4dfe406..9f6d3d14b0 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
   }
@@ -96,8 +97,8 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, CeedVector *e_vecs, CeedVector *q_vecs,
-                                        CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, CeedVector *e_vecs,
+                                        CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -183,6 +184,27 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         break;
     }
   }
+  // Drop duplicate input restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          skip_rstr[j] = true;
+        }
+      }
+    }
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -211,6 +233,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
 
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -219,10 +242,11 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
+  CeedCallBackend(
+      CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
   // Outfields
   CeedCallBackend(
-      CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+      CeedOperatorSetupFields_Cuda(qf, op, false, false, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -262,10 +286,10 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu
         uint64_t state;
 
         CeedCallBackend(CeedVectorGetState(vec, &state));
-        if (state != impl->input_states[i]) {
+        if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
           CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-          impl->input_states[i] = state;
         }
+        impl->input_states[i] = state;
         // Get evec
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       }
@@ -474,6 +498,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -482,9 +507,10 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
+                                               max_num_points, num_elem));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
                                                max_num_points, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 0c2c5b4972..f8430a1b12 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -128,8 +128,9 @@ typedef struct {
 } CeedOperatorAssemble_Cuda;
 
 typedef struct {
-  CeedVector                *e_vecs;        // E-vectors, inputs followed by outputs
+  bool                      *skip_rstr_in;
   uint64_t                  *input_states;  // State tracking for passive inputs
+  CeedVector                *e_vecs;        // E-vectors, inputs followed by outputs
   CeedVector                *q_vecs_in;     // Input Q-vectors needed to apply operator
   CeedVector                *q_vecs_out;    // Output Q-vectors needed to apply operator
   CeedInt                    num_inputs, num_outputs;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 34fb349f0d..bb5d09816d 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -26,6 +26,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
   }
@@ -95,8 +96,8 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, CeedVector *e_vecs, CeedVector *q_vecs,
-                                       CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, CeedVector *e_vecs,
+                                       CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -182,6 +183,27 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         break;
     }
   }
+  // Drop duplicate input restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          skip_rstr[j] = true;
+        }
+      }
+    }
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -210,6 +232,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
 
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -218,10 +241,11 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, false, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
+  CeedCallBackend(
+      CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
   // Outfields
   CeedCallBackend(
-      CeedOperatorSetupFields_Hip(qf, op, false, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+      CeedOperatorSetupFields_Hip(qf, op, false, false, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -261,10 +285,10 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun
         uint64_t state;
 
         CeedCallBackend(CeedVectorGetState(vec, &state));
-        if (state != impl->input_states[i]) {
+        if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
           CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-          impl->input_states[i] = state;
         }
+        impl->input_states[i] = state;
         // Get evec
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       }
@@ -473,6 +497,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
 
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -481,9 +506,10 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
+                                              max_num_points, num_elem));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
                                               max_num_points, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index b0df567c55..5199ce8767 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -132,8 +132,9 @@ typedef struct {
 } CeedOperatorAssemble_Hip;
 
 typedef struct {
-  CeedVector               *e_vecs;        // E-vectors, inputs followed by outputs
+  bool                     *skip_rstr_in;
   uint64_t                 *input_states;  // State tracking for passive inputs
+  CeedVector               *e_vecs;        // E-vectors, inputs followed by outputs
   CeedVector               *q_vecs_in;     // Input Q-vectors needed to apply operator
   CeedVector               *q_vecs_out;    // Output Q-vectors needed to apply operator
   CeedInt                   num_inputs, num_outputs;
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index db292879ce..eaacaedc12 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -139,6 +139,28 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
     // Initialize E-vec arrays
     if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0));
   }
+  // Drop duplicate input restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i], &e_vecs_full[j]));
+          CeedCallBackend(CeedElemRestrictionDestroy(&block_rstr[j]));
+        }
+      }
+    }
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -216,22 +238,23 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun
                                               CeedVector in_vec, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl,
                                               CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    uint64_t     state;
     CeedEvalMode eval_mode;
-    CeedVector   vec;
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      uint64_t   state;
+      CeedVector vec;
+
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec != CEED_VECTOR_ACTIVE) {
         // Restrict
         CeedCallBackend(CeedVectorGetState(vec, &state));
-        if (state != impl->input_states[i]) {
+        if (state != impl->input_states[i] && impl->block_rstr[i]) {
           CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
-          impl->input_states[i] = state;
         }
+        impl->input_states[i] = state;
         // Get evec
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data[i]));
       } else {
@@ -272,7 +295,7 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Restrict block active input
-    if (is_active_input) {
+    if (is_active_input && impl->block_rstr[i]) {
       CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i], e / block_size, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request));
     }
     // Basis action
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 0c472d1d3c..d605c943e7 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -16,7 +16,7 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs_full, CeedVector *e_vecs,
+static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs,
                                        CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
@@ -78,6 +78,28 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
         break;
     }
   }
+  // Drop duplicate input restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i], &e_vecs_full[j]));
+          skip_rstr[j] = true;
+        }
+      }
+    }
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -105,6 +127,7 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -116,10 +139,11 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, true, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
-  // Outfields
   CeedCallBackend(
-      CeedOperatorSetupFields_Ref(qf, op, false, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
+      CeedOperatorSetupFields_Ref(qf, op, true, impl->skip_rstr_in, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
+  // Outfields
+  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, false, NULL, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
+                                              num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -167,11 +191,11 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun
       // Restrict
       CeedCallBackend(CeedVectorGetState(vec, &state));
       // Skip restriction if input is unchanged
-      if (state != impl->input_states[i] || vec == in_vec) {
+      if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
-        impl->input_states[i] = state;
       }
+      impl->input_states[i] = state;
       // Get evec
       CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
     }
@@ -566,8 +590,8 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Ref(CeedOperator op, CeedVe
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs_full, CeedVector *e_vecs,
-                                               CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
+static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedVector *e_vecs_full,
+                                               CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             max_num_points, num_comp, size, P;
@@ -661,6 +685,27 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
     if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0));
     if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorSetValue(q_vecs[i], 0.0));
   }
+  // Drop duplicate input restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          skip_rstr[j] = true;
+        }
+      }
+    }
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -688,6 +733,7 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -699,9 +745,10 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->skip_rstr_in, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
+                                                      num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
+  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, NULL, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
                                                       num_output_fields, Q));
 
   // Identity QFunctions
@@ -741,7 +788,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Restrict block active input
-    if (is_active_input) {
+    if (is_active_input && !impl->skip_rstr_in[i]) {
       if (rstr_type == CEED_RESTRICTION_POINTS) {
         CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request));
       } else {
@@ -1341,6 +1388,7 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) {
   CeedOperator_Ref *impl;
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
   }
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index 369a27c049..ff8e9fa773 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -49,8 +49,9 @@ typedef struct {
 
 typedef struct {
   bool        is_identity_qf, is_identity_rstr_op;
-  CeedVector *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
+  bool       *skip_rstr_in;
   uint64_t   *input_states; /* State counter of inputs */
+  CeedVector *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   CeedVector *e_vecs_in;    /* Single element input E-vectors  */
   CeedVector *e_vecs_out;   /* Single element output E-vectors */
   CeedVector *q_vecs_in;    /* Single element input Q-vectors  */

From db2becc9f302fe8eb3a32ace50ce3f3a5d42e6c4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 13 Aug 2024 23:30:20 +0100
Subject: [PATCH 138/571] Add CeedBasisApplyAdd (#1644)

* basis - add CeedBasisApplyAdd + CPU impl

* basis - add ref GPU ApplyAdd

* basis - add shared GPU ApplyAdd

* basis - add MAGMA ApplyAdd

* basis - add CeedBasisApplyAddAtPoints + default impl

* basis - add GPU ApplyAddAtPoints

* tidy - add extra assert to fix clang-tidy

* Apply suggestions from code review

style - consistently use indexing over pointer arithmatic

Co-authored-by: Zach Atkins <zach.atkins@colorado.edu>

* style - more pointer fixes

---------

Co-authored-by: Zach Atkins <zach.atkins@colorado.edu>
---
 backends/cuda-ref/ceed-cuda-ref-basis.c       |  67 +++++--
 backends/cuda-shared/ceed-cuda-shared-basis.c |  63 ++++--
 backends/cuda-shared/ceed-cuda-shared.h       |   2 +
 backends/hip-ref/ceed-hip-ref-basis.c         |  66 +++++--
 backends/hip-shared/ceed-hip-shared-basis.c   |  64 ++++--
 backends/hip-shared/ceed-hip-shared.h         |   2 +
 backends/magma/ceed-magma-basis.c             |  59 +++++-
 backends/magma/ceed-magma.h                   |   4 +
 backends/ref/ceed-ref-basis.c                 |  49 +++--
 include/ceed-impl.h                           |   2 +
 include/ceed/ceed.h                           |   3 +
 .../cuda/cuda-ref-basis-nontensor-templates.h |   4 +-
 .../cuda/cuda-ref-basis-tensor-at-points.h    |  26 +--
 .../jit-source/cuda/cuda-ref-basis-tensor.h   |  23 +--
 .../cuda/cuda-ref-operator-assemble.h         |   2 +-
 .../cuda/cuda-ref-restriction-at-points.h     |   2 +-
 .../cuda/cuda-ref-restriction-curl-oriented.h |   4 +-
 .../cuda/cuda-ref-restriction-offset.h        |   2 +-
 .../cuda/cuda-ref-restriction-oriented.h      |   2 +-
 .../cuda-shared-basis-read-write-templates.h  |  41 ++++
 .../cuda/cuda-shared-basis-tensor-templates.h |  80 ++++----
 .../cuda/cuda-shared-basis-tensor.h           |  68 +++++++
 .../hip/hip-ref-basis-nontensor-templates.h   |   6 +-
 .../hip/hip-ref-basis-tensor-at-points.h      |  30 +--
 .../jit-source/hip/hip-ref-basis-tensor.h     |  23 +--
 .../hip/hip-ref-operator-assemble.h           |   2 +-
 .../hip/hip-ref-restriction-at-points.h       |   2 +-
 .../hip/hip-ref-restriction-curl-oriented.h   |   4 +-
 .../hip/hip-ref-restriction-offset.h          |   2 +-
 .../hip/hip-ref-restriction-oriented.h        |   2 +-
 .../hip-shared-basis-read-write-templates.h   |  41 ++++
 .../hip/hip-shared-basis-tensor-templates.h   |  80 ++++----
 .../jit-source/hip/hip-shared-basis-tensor.h  |  81 ++++++++
 .../jit-source/magma/magma-basis-grad-1d.h    |  45 +++++
 .../jit-source/magma/magma-basis-grad-2d.h    |  51 +++++
 .../jit-source/magma/magma-basis-grad-3d.h    |  58 ++++++
 .../jit-source/magma/magma-basis-interp-1d.h  |  45 +++++
 .../jit-source/magma/magma-basis-interp-2d.h  |  41 ++++
 .../jit-source/magma/magma-basis-interp-3d.h  |  41 ++++
 .../magma-basis-interp-deriv-nontensor.h      | 106 ++++++++++
 .../jit-source/magma/magma-common-nontensor.h |  19 ++
 .../jit-source/magma/magma-common-tensor.h    |  46 +++++
 interface/ceed-basis.c                        | 186 ++++++++++++++++--
 interface/ceed.c                              |   2 +
 tests/README.md                               |   3 +-
 tests/t360-basis.c                            |  56 ++++++
 tests/t361-basis.c                            | 116 +++++++++++
 tests/t362-basis.c                            |  59 ++++++
 tests/t363-basis.c                            |  54 +++++
 tests/t364-basis.c                            |  98 +++++++++
 tests/t365-basis.c                            | 123 ++++++++++++
 51 files changed, 1815 insertions(+), 242 deletions(-)
 create mode 100644 tests/t360-basis.c
 create mode 100644 tests/t361-basis.c
 create mode 100644 tests/t362-basis.c
 create mode 100644 tests/t363-basis.c
 create mode 100644 tests/t364-basis.c
 create mode 100644 tests/t365-basis.c

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 91958b6b25..8cf285cbc8 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -18,7 +18,8 @@
 //------------------------------------------------------------------------------
 // Basis apply - tensor
 //------------------------------------------------------------------------------
-int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                   CeedVector u, CeedVector v) {
   Ceed              ceed;
   CeedInt           Q_1d, dim;
   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -33,10 +34,11 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -83,11 +85,23 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                               CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                  CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - tensor AtPoints
 //------------------------------------------------------------------------------
-int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
-                                CeedVector x_ref, CeedVector u, CeedVector v) {
+static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                           CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   Ceed              ceed;
   CeedInt           Q_1d, dim, max_num_points = num_points[0];
   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -158,10 +172,11 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -200,11 +215,23 @@ int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const C
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                          CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
-int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                 CeedVector v) {
+static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                            CeedVector u, CeedVector v) {
   Ceed                     ceed;
   CeedInt                  num_nodes, num_qpts;
   const CeedInt            is_transpose    = t_mode == CEED_TRANSPOSE;
@@ -222,10 +249,11 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -291,6 +319,18 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                        CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                           CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy tensor basis
 //------------------------------------------------------------------------------
@@ -374,7 +414,9 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
@@ -434,6 +476,7 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
   return CEED_ERROR_SUCCESS;
 }
@@ -493,6 +536,7 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
   return CEED_ERROR_SUCCESS;
 }
@@ -552,6 +596,7 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 118d156a84..4f0901484d 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -27,8 +27,8 @@ int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
 //------------------------------------------------------------------------------
 // Apply basis
 //------------------------------------------------------------------------------
-int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                     CeedVector v) {
+static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                                CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
   Ceed                   ceed;
   Ceed_Cuda             *ceed_Cuda;
   CeedInt                dim, num_comp;
@@ -45,7 +45,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Apply basis operation
   switch (eval_mode) {
@@ -66,7 +67,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1,
+                                                      elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
@@ -78,8 +80,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
+                                                      elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
@@ -89,8 +91,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
+                                                      elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
@@ -116,7 +118,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1,
+                                                      elems_per_block, shared_mem, grad_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
@@ -128,7 +131,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
+                                                      elems_per_block, shared_mem, grad_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
@@ -138,7 +142,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
+                                                      elems_per_block, shared_mem, grad_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
@@ -186,11 +191,23 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                            CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                               CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - tensor AtPoints
 //------------------------------------------------------------------------------
-int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
-                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                                  CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   Ceed                   ceed;
   CeedInt                Q_1d, dim, max_num_points = num_points[0];
   const CeedInt          is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -261,10 +278,11 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -303,6 +321,18 @@ int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                              CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                                 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy basis
 //------------------------------------------------------------------------------
@@ -374,8 +404,10 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                    "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
   CeedCallBackend(CeedFree(&basis_kernel_path));
   CeedCallBackend(CeedFree(&basis_kernel_source));
@@ -384,6 +416,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index 40bbaed6a0..96800a0a86 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -14,8 +14,10 @@ typedef struct {
   CUmodule    module;
   CUfunction  Interp;
   CUfunction  InterpTranspose;
+  CUfunction  InterpTransposeAdd;
   CUfunction  Grad;
   CUfunction  GradTranspose;
+  CUfunction  GradTransposeAdd;
   CUfunction  Weight;
   CUmodule    moduleAtPoints;
   CeedInt     num_points;
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index fd1a9fabde..78e27a1a5f 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -17,7 +17,8 @@
 //------------------------------------------------------------------------------
 // Basis apply - tensor
 //------------------------------------------------------------------------------
-int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                  CeedVector u, CeedVector v) {
   Ceed              ceed;
   CeedInt           Q_1d, dim;
   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -32,10 +33,11 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -82,11 +84,22 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                 CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - tensor AtPoints
 //------------------------------------------------------------------------------
-int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
-                               CeedVector x_ref, CeedVector u, CeedVector v) {
+static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                          CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   Ceed              ceed;
   CeedInt           Q_1d, dim, max_num_points = num_points[0];
   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -157,10 +170,11 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -199,11 +213,23 @@ int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const Ce
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                      CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                         CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
-int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                CeedVector v) {
+static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                           CeedVector u, CeedVector v) {
   Ceed                    ceed;
   CeedInt                 num_nodes, num_qpts;
   const CeedInt           is_transpose    = t_mode == CEED_TRANSPOSE;
@@ -221,10 +247,11 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -290,6 +317,18 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                       CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                          CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy tensor basis
 //------------------------------------------------------------------------------
@@ -373,7 +412,9 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
@@ -433,6 +474,7 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
   return CEED_ERROR_SUCCESS;
 }
@@ -492,6 +534,7 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
   return CEED_ERROR_SUCCESS;
 }
@@ -551,6 +594,7 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index ff41f9efe6..bda080ed2d 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -87,8 +87,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c
 //------------------------------------------------------------------------------
 // Apply basis
 //------------------------------------------------------------------------------
-int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                    CeedVector v) {
+static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                               CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
   Ceed                  ceed;
   Ceed_Hip             *ceed_Hip;
   CeedInt               dim, num_comp;
@@ -105,7 +105,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Apply basis operation
   switch (eval_mode) {
@@ -125,7 +126,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1,
+                                                     elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
@@ -136,8 +138,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
+                                                     elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
@@ -147,8 +149,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
+                                                     elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
@@ -174,7 +176,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1,
+                                                     elems_per_block, shared_mem, grad_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
@@ -185,7 +188,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
+                                                     elems_per_block, shared_mem, grad_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
@@ -195,7 +199,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
+                                                     elems_per_block, shared_mem, grad_args));
         } else {
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
@@ -245,11 +250,23 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
   return CEED_ERROR_SUCCESS;
 }
 
+int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                    CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                       CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - tensor AtPoints
 //------------------------------------------------------------------------------
-int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
-                                      CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                                 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   Ceed                  ceed;
   CeedInt               Q_1d, dim, max_num_points = num_points[0];
   const CeedInt         is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -320,10 +337,11 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Clear v for transpose operation
-  if (is_transpose) {
+  if (is_transpose && !apply_add) {
     CeedSize length;
 
     CeedCallBackend(CeedVectorGetLength(v, &length));
@@ -362,6 +380,18 @@ int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, c
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                             CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                                CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy basis
 //------------------------------------------------------------------------------
@@ -438,8 +468,10 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                   has_collocated_grad));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
   CeedCallBackend(CeedFree(&basis_kernel_path));
   CeedCallBackend(CeedFree(&basis_kernel_source));
@@ -448,7 +480,9 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index 8bc9d041a2..fe3384f55d 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -14,8 +14,10 @@ typedef struct {
   hipModule_t   module;
   hipFunction_t Interp;
   hipFunction_t InterpTranspose;
+  hipFunction_t InterpTransposeAdd;
   hipFunction_t Grad;
   hipFunction_t GradTranspose;
+  hipFunction_t GradTransposeAdd;
   hipFunction_t Weight;
   hipModule_t   moduleAtPoints;
   CeedInt       num_points;
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index de18a1a2fc..71a86d5b8d 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -26,7 +26,8 @@
 //------------------------------------------------------------------------------
 // Basis apply - tensor
 //------------------------------------------------------------------------------
-static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
+static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                    CeedVector v) {
   Ceed              ceed;
   Ceed_Magma       *data;
   CeedInt           dim, num_comp, num_nodes, P_1d, Q_1d, P, Q;
@@ -52,7 +53,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
   // Read vectors
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Apply basis operation
   switch (e_mode) {
@@ -115,7 +117,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
       void   *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem};
 
       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->InterpTransposeAdd : impl->InterpTranspose, grid, num_threads, num_t_col,
+                                                    1, shared_mem, args));
       } else {
         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
@@ -192,7 +195,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
                         &v_elem_stride,     &v_comp_stride,   &v_dim_stride, &num_elem};
 
       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, grid, num_threads, num_t_col, 1,
+                                                    shared_mem, args));
       } else {
         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
@@ -248,6 +252,16 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - tensor AtPoints
 //------------------------------------------------------------------------------
@@ -259,8 +273,8 @@ int CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
-static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
-                                         CeedVector v) {
+static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode,
+                                             CeedVector u, CeedVector v) {
   Ceed                      ceed;
   Ceed_Magma               *data;
   CeedInt                   num_comp, num_nodes, num_qpts, P, Q, N;
@@ -281,7 +295,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
   // Read vectors
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Compile kernels for N as needed
   CeedInt iN = 0;
@@ -344,8 +359,10 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
                                        impl->NB_deriv_t[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_ta", &impl->InterpTransposeAdd[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN]));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_ta", &impl->DerivTransposeAdd[iN]));
       if (!impl->Weight) {
         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight));
         CeedCallBackend(CeedFree(&weight_kernel_path));
@@ -388,7 +405,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
       if (e_mode == CEED_EVAL_INTERP) {
         if (t_mode == CEED_TRANSPOSE) {
-          Kernel = impl->InterpTranspose[iN];
+          Kernel = apply_add ? impl->InterpTransposeAdd[iN] : impl->InterpTranspose[iN];
           NB     = impl->NB_interp_t[iN];
         } else {
           Kernel = impl->Interp[iN];
@@ -396,7 +413,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
         }
       } else {
         if (t_mode == CEED_TRANSPOSE) {
-          Kernel = impl->DerivTranspose[iN];
+          Kernel = apply_add ? impl->DerivTransposeAdd[iN] : impl->DerivTranspose[iN];
           NB     = impl->NB_deriv_t[iN];
         } else {
           Kernel = impl->Deriv[iN];
@@ -414,7 +431,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     } else {
       for (CeedInt d = 0; d < q_comp; d++) {
         if (t_mode == CEED_TRANSPOSE) {
-          const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
+          const CeedScalar beta = (apply_add || (d > 0)) ? 1.0 : 0.0;
           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue);
         } else {
           magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
@@ -443,6 +460,18 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                         CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                            CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy tensor basis
 //------------------------------------------------------------------------------
@@ -559,22 +588,28 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
     case 1:
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_1d_kernel", &impl->InterpTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_1d_kernel", &impl->GradTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight));
       break;
     case 2:
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_2d_kernel", &impl->InterpTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_2d_kernel", &impl->GradTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight));
       break;
     case 3:
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_3d_kernel", &impl->InterpTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_3d_kernel", &impl->GradTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight));
       break;
   }
@@ -588,6 +623,7 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
   return CEED_ERROR_SUCCESS;
@@ -650,6 +686,7 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
   return CEED_ERROR_SUCCESS;
 }
@@ -711,6 +748,7 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
   return CEED_ERROR_SUCCESS;
 }
@@ -772,6 +810,7 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index aa60b37b40..22dd4264b6 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -47,8 +47,10 @@ typedef struct {
   CeedMagmaModule   module;
   CeedMagmaFunction Interp;
   CeedMagmaFunction InterpTranspose;
+  CeedMagmaFunction InterpTransposeAdd;
   CeedMagmaFunction Grad;
   CeedMagmaFunction GradTranspose;
+  CeedMagmaFunction GradTransposeAdd;
   CeedMagmaFunction Weight;
   CeedScalar       *d_interp_1d;
   CeedScalar       *d_grad_1d;
@@ -59,8 +61,10 @@ typedef struct {
   CeedMagmaModule   module[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction InterpTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction DerivTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Weight;
   CeedInt           NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedInt           NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index b82e8bb278..121669012a 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -16,11 +16,11 @@
 //------------------------------------------------------------------------------
 // Basis Apply
 //------------------------------------------------------------------------------
-static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) {
+static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U,
+                                  CeedVector V) {
   Ceed               ceed;
-  bool               is_tensor_basis;
+  bool               is_tensor_basis, add = apply_add || (t_mode == CEED_TRANSPOSE);
   CeedInt            dim, num_comp, q_comp, num_nodes, num_qpts;
-  const CeedInt      add = (t_mode == CEED_TRANSPOSE);
   const CeedScalar  *u;
   CeedScalar        *v;
   CeedTensorContract contract;
@@ -36,13 +36,15 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
   CeedCallBackend(CeedBasisGetTensorContract(basis, &contract));
   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v));
-
   // Clear v if operating in transpose
-  if (t_mode == CEED_TRANSPOSE) {
-    const CeedInt v_size = num_elem * num_comp * num_nodes;
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(V, CEED_MEM_HOST, &v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v));
+
+  if (t_mode == CEED_TRANSPOSE && !apply_add) {
+    CeedSize len;
 
-    for (CeedInt i = 0; i < v_size; i++) v[i] = (CeedScalar)0.0;
+    CeedCallBackend(CeedVectorGetLength(V, &len));
+    for (CeedInt i = 0; i < len; i++) v[i] = 0.0;
   }
 
   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor_basis));
@@ -101,8 +103,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
           //  or Grad to quadrature points (Transpose)
           for (CeedInt d = 0; d < dim; d++) {
             CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? interp_1d : impl->collo_grad_1d), t_mode,
-                                                    add && (d > 0),
-                                                    (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : u + d * num_qpts * num_comp * num_elem),
+                                                    (t_mode == CEED_TRANSPOSE) && (d > 0),
+                                                    (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : &u[d * num_qpts * num_comp * num_elem]),
                                                     (t_mode == CEED_NOTRANSPOSE ? (d == dim - 1 ? interp : tmp[(d + 1) % 2]) : interp)));
             pre /= P;
             post *= Q;
@@ -117,9 +119,10 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
           pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem;
           for (CeedInt d = 0; d < dim; d++) {
             CeedCallBackend(CeedTensorContractApply(
-                contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode, add && (d == dim - 1),
+                contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode,
+                (t_mode == CEED_NOTRANSPOSE && apply_add) || (t_mode == CEED_TRANSPOSE && (d == dim - 1)),
                 (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])),
-                (t_mode == CEED_NOTRANSPOSE ? v + d * num_qpts * num_comp * num_elem : (d == dim - 1 ? v : tmp[(d + 1) % 2]))));
+                (t_mode == CEED_NOTRANSPOSE ? &v[d * num_qpts * num_comp * num_elem] : (d == dim - 1 ? v : tmp[(d + 1) % 2]))));
             pre /= P;
             post *= Q;
           }
@@ -133,8 +136,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
 
           for (CeedInt d = 0; d < dim; d++) {
             CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, grad_1d, t_mode, add && (d > 0),
-                                                    t_mode == CEED_NOTRANSPOSE ? u : u + d * num_comp * num_qpts * num_elem,
-                                                    t_mode == CEED_TRANSPOSE ? v : v + d * num_comp * num_qpts * num_elem));
+                                                    t_mode == CEED_NOTRANSPOSE ? u : &u[d * num_comp * num_qpts * num_elem],
+                                                    t_mode == CEED_TRANSPOSE ? v : &v[d * num_comp * num_qpts * num_elem]));
             pre /= P;
             post *= Q;
           }
@@ -156,8 +159,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
             for (CeedInt d = 0; d < dim; d++) {
               CeedCallBackend(CeedTensorContractApply(
                   contract, pre, P, post, Q, (p == d) ? grad_1d : interp_1d, t_mode, add && (d == dim - 1),
-                  (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : u + p * num_comp * num_qpts * num_elem) : tmp[d % 2]),
-                  (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? v : v + p * num_comp * num_qpts * num_elem) : tmp[(d + 1) % 2])));
+                  (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : &u[p * num_comp * num_qpts * num_elem]) : tmp[d % 2]),
+                  (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? v : &v[p * num_comp * num_qpts * num_elem]) : tmp[(d + 1) % 2])));
               pre /= P;
               post *= Q;
             }
@@ -249,6 +252,16 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) {
+  CeedCallBackend(CeedBasisApplyCore_Ref(basis, false, num_elem, t_mode, eval_mode, U, V));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) {
+  CeedCallBackend(CeedBasisApplyCore_Ref(basis, true, num_elem, t_mode, eval_mode, U, V));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis Destroy Tensor
 //------------------------------------------------------------------------------
@@ -297,6 +310,7 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref));
   return CEED_ERROR_SUCCESS;
 }
@@ -316,6 +330,7 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -334,6 +349,7 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -352,6 +368,7 @@ int CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index de206e9e59..0b76071a08 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -186,7 +186,9 @@ struct CeedElemRestriction_private {
 struct CeedBasis_private {
   Ceed ceed;
   int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector);
+  int (*ApplyAdd)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector);
   int (*ApplyAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
+  int (*ApplyAddAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
   int (*Destroy)(CeedBasis);
   int                ref_count;
   bool               is_tensor_basis; /* flag for tensor basis */
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index f55aa9c6f3..d847525da8 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -305,8 +305,11 @@ CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_
 CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy);
 CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream);
 CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
 CEED_EXTERN int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                                        CeedVector x_ref, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                          CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v);
 CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed);
 CEED_EXTERN Ceed CeedBasisReturnCeed(CeedBasis basis);
 CEED_EXTERN int  CeedBasisGetDimension(CeedBasis basis, CeedInt *dim);
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
index 64b57d0d68..6b19ad448d 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
@@ -53,9 +53,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid
     // Run with P threads
     r_V = 0.0;
     for (CeedInt d = 0; d < Q_COMP; d++) {
-      U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;
+      U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U];
       for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i];
     }
-    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V;
+    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V;
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index fc65a10912..263d29338e 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -67,8 +67,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   if (is_transpose) {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt           pre   = 1;
         CeedInt           post  = 1;
 
@@ -85,7 +85,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
             // Update buffers used
             pre /= 1;
-            const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
+            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
@@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            out[k] = v_k;
+            if (d == BASIS_DIM - 1) out[k] += v_k;
+            else out[k] = v_k;
           }
           post *= P;
         }
@@ -133,8 +134,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   } else {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt           pre   = u_size;
         CeedInt           post  = 1;
 
@@ -169,7 +170,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             // Update buffers used
             pre /= Q;
             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
+            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
@@ -222,7 +223,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
   if (is_transpose) {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt     pre   = 1;
         CeedInt     post  = 1;
 
@@ -235,7 +236,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
+            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
 
             pre  = 1;
             post = 1;
@@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            out[k] = v_k;
+            if (d == BASIS_DIM - 1) out[k] += v_k;
+            else out[k] = v_k;
           }
           post *= P;
         }
@@ -292,7 +294,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
   } else {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
         CeedInt           pre   = u_size;
         CeedInt           post  = 1;
 
@@ -322,7 +324,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
 
             pre  = BASIS_NUM_QPTS;
             post = 1;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index 468ec978c0..4c8c2f447c 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -42,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-      const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-      CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
       CeedInt           pre   = u_size;
       CeedInt           post  = 1;
 
@@ -57,13 +57,14 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans
 
         // Contract along middle index
         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-          const CeedInt c  = k % post;
-          const CeedInt j  = (k / post) % Q;
-          const CeedInt a  = k / (post * Q);
-          CeedScalar    vk = 0;
-
-          for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-          out[k] = vk;
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
+          if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
         post *= Q;
       }
@@ -106,8 +107,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpo
       for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
         CeedInt           pre   = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
         CeedInt           post  = 1;
-        const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
 
         for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
           __syncthreads();
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 318a07b163..9fc02a1c7a 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                         const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out,
                         const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) {
   extern __shared__ CeedScalar s_CT[];
-  CeedScalar                  *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN;
+  CeedScalar                  *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN];
 
   const int l = threadIdx.x;  // The output column index of each B^T D B operation
                               // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
index 83c4086ed0..87aeda2e3b 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
@@ -23,7 +23,7 @@ extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices
 
     if (loc_node >= points_per_elem[elem]) continue;
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
index d317f42cc5..86a4b53545 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
@@ -80,7 +80,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
@@ -138,7 +138,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
index 0bd3dc0dd8..9492b31984 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
@@ -36,7 +36,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices,
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
index d36f27277e..7c667922bf 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
@@ -40,7 +40,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE,
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE],
                 u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0));
     }
   }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index b10ba108f8..56234c28e4 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -46,6 +46,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D) {
+    const CeedInt node = data.t_id_x;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D
 //------------------------------------------------------------------------------
@@ -82,6 +95,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
@@ -121,3 +147,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn
     }
   }
 }
+
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index f2fde94139..56989f2b69 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -52,7 +52,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }
 
@@ -63,7 +63,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                          CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }
 
@@ -74,7 +74,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
@@ -85,7 +85,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
@@ -188,8 +188,8 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -201,8 +201,8 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -214,10 +214,10 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
@@ -229,10 +229,10 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -423,9 +423,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
   }
 }
 
@@ -438,9 +438,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
@@ -453,15 +453,15 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
@@ -474,15 +474,15 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
@@ -495,12 +495,12 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedS
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
@@ -513,12 +513,12 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
+    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index d6039d3a33..c295362978 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -81,6 +81,39 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   }
 }
 
+extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                              CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
@@ -154,6 +187,41 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   }
 }
 
+extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                            CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Weight kernels by dim
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
index 00b559ff10..0374d459d5 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
@@ -24,7 +24,7 @@ inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     // Run with Q threads
-    U = d_U + elem * strides_elem_U + comp * strides_comp_U;
+    U = &d_U[elem * strides_elem_U + comp * strides_comp_U];
     for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0;
     for (CeedInt i = 0; i < P; i++) {
       const CeedScalar val = U[i];
@@ -53,9 +53,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid
     // Run with P threads
     r_V = 0.0;
     for (CeedInt d = 0; d < Q_COMP; d++) {
-      U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;
+      U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U];
       for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i];
     }
-    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V;
+    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V;
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index fc65a10912..22d81bc30a 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -67,8 +67,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   if (is_transpose) {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt           pre   = 1;
         CeedInt           post  = 1;
 
@@ -85,7 +85,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
             // Update buffers used
             pre /= 1;
-            const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
+            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
@@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            out[k] = v_k;
+            if (d == BASIS_DIM - 1) out[k] += v_k;
+            else out[k] = v_k;
           }
           post *= P;
         }
@@ -133,8 +134,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   } else {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt           pre   = u_size;
         CeedInt           post  = 1;
 
@@ -169,7 +170,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             // Update buffers used
             pre /= Q;
             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
+            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
@@ -222,7 +223,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
   if (is_transpose) {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt     pre   = 1;
         CeedInt     post  = 1;
 
@@ -235,14 +236,14 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
+            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
 
             pre  = 1;
             post = 1;
             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
               // Update buffers used
               pre /= 1;
-              const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
+              const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
 
               // Build Chebyshev polynomial values
@@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            out[k] = v_k;
+            if (d == BASIS_DIM - 1) out[k] += v_k;
+            else out[k] = v_k;
           }
           post *= P;
         }
@@ -292,7 +294,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
   } else {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
         CeedInt           pre   = u_size;
         CeedInt           post  = 1;
 
@@ -322,7 +324,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
 
             pre  = BASIS_NUM_QPTS;
             post = 1;
@@ -330,7 +332,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
               // Update buffers used
               pre /= Q;
               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
+              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
 
               // Build Chebyshev polynomial values
               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index 7d732f8e77..db509ac2a0 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -42,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-      const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-      CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
       CeedInt           pre   = u_size;
       CeedInt           post  = 1;
 
@@ -57,13 +57,14 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans
 
         // Contract along middle index
         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-          const CeedInt c  = k % post;
-          const CeedInt j  = (k / post) % Q;
-          const CeedInt a  = k / (post * Q);
-          CeedScalar    vk = 0;
-
-          for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-          out[k] = vk;
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
+          if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
         post *= Q;
       }
@@ -106,8 +107,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpo
       for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
         CeedInt           pre   = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
         CeedInt           post  = 1;
-        const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
 
         for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
           __syncthreads();
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index 35789443b6..838dcfd4a5 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                         const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out,
                         const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) {
   extern __shared__ CeedScalar s_CT[];
-  CeedScalar                  *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN;
+  CeedScalar                  *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN];
 
   const int l = threadIdx.x;  // The output column index of each B^T D B operation
                               // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
index 614628a81f..f4cb95993b 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
@@ -23,7 +23,7 @@ extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices
 
     if (loc_node >= points_per_elem[elem]) continue;
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
index 4d3e88ce27..76d9758828 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
@@ -80,7 +80,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
@@ -138,7 +138,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
index 26cd41ee92..65283b7193 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
@@ -36,7 +36,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices,
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
index cb987fa8a7..f983a24fc0 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
@@ -40,7 +40,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE,
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE],
                 u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0));
     }
   }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index a6d945ac56..379d52d13b 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -56,6 +56,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D) {
+    const CeedInt node = data.t_id_x;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D
 //------------------------------------------------------------------------------
@@ -92,6 +105,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
@@ -131,3 +157,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt
     }
   }
 }
+
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index 5e52d1c829..8dc50e4ed8 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -52,7 +52,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }
 
@@ -63,7 +63,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                          CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }
 
@@ -74,7 +74,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
@@ -85,7 +85,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
@@ -187,8 +187,8 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -200,8 +200,8 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedS
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -213,10 +213,10 @@ inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__re
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
@@ -228,10 +228,10 @@ inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedSca
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -421,9 +421,9 @@ inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
   }
 }
 
@@ -436,9 +436,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedS
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
@@ -451,15 +451,15 @@ inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__re
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
@@ -472,15 +472,15 @@ inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedSca
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
@@ -493,12 +493,12 @@ inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedSc
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
@@ -511,12 +511,12 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, con
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
+    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 0a9a1f3cee..d052e53bf1 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -92,6 +92,44 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   }
 }
 
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)];
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
@@ -181,6 +219,49 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   }
 }
 
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
+                          CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)];
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Weight kernels by dim
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
index dd21682225..cd6f8548fb 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
@@ -126,3 +126,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_gradta_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU,
+                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar *sU[BASIS_NUM_COMP];
+  CeedScalar *sV[BASIS_NUM_COMP];
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT = (CeedScalar *)shared_data;
+  CeedScalar *sW = sT + BASIS_Q * BASIS_P;
+  sU[0]          = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P);
+  sV[0]          = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q);
+  for (int comp = 1; comp < BASIS_NUM_COMP; comp++) {
+    sU[comp] = sU[comp - 1] + (1 * BASIS_Q);
+    sV[comp] = sV[comp - 1] + (1 * BASIS_P);
+  }
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dTgrad, sT);
+  }
+
+  // read U
+  read_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
+
+  __syncthreads();
+  magma_grad_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, sU, sV, tx);
+  __syncthreads();
+
+  // sum into V
+  sum_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
index 23559716dc..b4e7e2981a 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
@@ -188,3 +188,54 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_gradta_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
+                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // here DIM_U = 1, but might be different for a fused operator
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // here DIM_V = 1, but might be different for a fused operator
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sTinterp = (CeedScalar *)shared_data;
+  CeedScalar *sTgrad   = sTinterp + BASIS_Q * BASIS_P;
+  CeedScalar *sTmp     = sTgrad + BASIS_Q * BASIS_P;
+  sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dinterp1d, sTinterp);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dgrad1d, sTgrad);
+  }
+  __syncthreads();
+
+  /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_2d_device */
+
+  /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_2d_device */
+
+  // sum into V
+  sum_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
index c030f8e9e5..c8028be756 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
@@ -225,3 +225,61 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
   // write V
   write_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
+    void magma_gradta_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
+                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // here DIM_U = 1, but might be different for a fused operator
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // here DIM_V = 1, but might be different for a fused operator
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sTinterp = (CeedScalar *)shared_data;
+  CeedScalar *sTgrad   = sTinterp + BASIS_Q * BASIS_P;
+  CeedScalar *sTmp     = sTgrad + BASIS_Q * BASIS_P;
+  sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_Q, (BASIS_Q * BASIS_Q * BASIS_P) + (BASIS_Q * BASIS_P * BASIS_P)));
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dinterp1d, sTinterp);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dgrad1d, sTgrad);
+  }
+  __syncthreads();
+
+  /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_3d_device */
+
+  /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_3d_device */
+
+  /* read U (idim = 2 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 2, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_3d_device */
+
+  // sum into V
+  sum_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
index ae8d082653..02f894ecce 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
@@ -126,3 +126,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interpta_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                  const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar *sU[BASIS_NUM_COMP];
+  CeedScalar *sV[BASIS_NUM_COMP];
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT = (CeedScalar *)shared_data;
+  CeedScalar *sW = sT + BASIS_Q * BASIS_P;
+  sU[0]          = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P);
+  sV[0]          = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q);
+  for (int comp = 1; comp < BASIS_NUM_COMP; comp++) {
+    sU[comp] = sU[comp - 1] + (1 * BASIS_Q);
+    sV[comp] = sV[comp - 1] + (1 * BASIS_P);
+  }
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
+  }
+
+  // read U
+  read_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
+
+  __syncthreads();
+  magma_interp_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, sU, sV, tx);
+  __syncthreads();
+
+  // sum into V
+  sum_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
index a2a41a25ae..56c8081c83 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
@@ -144,3 +144,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_interpta_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                  const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_Q * BASIS_P;
+  sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
+  }
+
+  // read U -- there is a sync at the end of this function
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+
+  // no sync needed here -- read_U_2d already syncs at the end
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // sum into V
+  sum_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
index 50c7e4df4a..ac11e3f8df 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -172,3 +172,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
   // write V
   write_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
+    void magma_interpta_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                  const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp[BASIS_P]                  = {0.0};
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_Q * BASIS_P;
+  sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_MAX_P_Q, BASIS_Q * BASIS_P * BASIS_P));
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
+  }
+
+  // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0)
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+  // there is a sync at the end of this function
+
+  magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // sum into V
+  sum_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
index f5e2df1e90..0614732f02 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -99,6 +99,52 @@ static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, Ce
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+template <typename T, int Q_COMP, int P, int Q, int NB>
+static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
+                                                                  CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * Q * NB;
+  dC += id * P * NB;
+
+  // A is P x Q
+  CeedScalar *sA = shared_data;
+  CeedScalar *sB = shared_data + ty * Q * NB;
+
+  CeedScalar rC[NB] = {0.0};
+
+  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
+  for (int d = 0; d < Q_COMP; d++) {
+    // read A using all threads
+    CeedScalar rA[Q];
+    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
+    __syncthreads();
+
+    // read B
+    if (id < nblocks) {
+      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
+    }
+    __syncthreads();
+
+    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
+
+    dA += P * Q;
+    dB += Q * n;
+
+    __syncthreads();
+  }
+
+  // sum into C
+  if (id < nblocks) {
+    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 template <typename T, int P, int Q, int NB>
 static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
@@ -171,6 +217,42 @@ static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, C
   write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
 }
 
+////////////////////////////////////////////////////////////////////////////////
+template <typename T, int P, int Q, int NB>
+static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                   CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * Q * NB;
+  dC += id * P * NB;
+
+  // A is P x Q
+  CeedScalar *sA = shared_data;
+  CeedScalar *sB = shared_data + ty * Q * NB;
+
+  // read A using all threads
+  CeedScalar rA[Q];
+  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
+  __syncthreads();
+
+  // terminate threads with no work
+  if (id >= nblocks) return;
+
+  // read B
+  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
+  __syncthreads();
+
+  CeedScalar rC[NB];
+  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
+
+  // sum into C
+  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
@@ -195,6 +277,18 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _
 #endif
 }
 
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+#if BASIS_Q_COMP_INTERP == 1
+  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#else
+  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
@@ -218,3 +312,15 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _
   magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
 #endif
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+#if BASIS_Q_COMP_DERIV == 1
+  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#else
+  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#endif
+}
diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h
index 730acc6419..945227d145 100644
--- a/include/ceed/jit-source/magma/magma-common-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-common-nontensor.h
@@ -104,6 +104,25 @@ static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into C from reg. to global
+// C is (P x NB)
+// 1D thread config. with (P x 1) threads
+// no sync at the end of the function
+template <typename T, int P, int Q, int NB>
+static __device__ __inline__ void sum_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) {
+  if (n != NB) {
+    for (int i = 0; i < n; i++) {
+      dC[i * P + tx] += rC[i];
+    }
+  } else {
+#pragma unroll
+    for (int i = 0; i < NB; i++) {
+      dC[i * P + tx] += rC[i];
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // multiply C = A x B using 1D threads in P x 1 config
 // A (P x Q)  in reg., one row per thread
diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h
index 6c483abd9d..494afacd87 100644
--- a/include/ceed/jit-source/magma/magma-common-tensor.h
+++ b/include/ceed/jit-source/magma/magma-common-tensor.h
@@ -36,6 +36,18 @@ static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, cons
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into V of a 1D element into global memory from sV[][] --  for all components
+// the devptr is assumed to point directly to the element
+template <typename T, int LENGTH, int NUM_COMP>
+static __device__ __inline__ void sum_1d(T *sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) {
+  if (tx < LENGTH) {
+    for (int comp = 0; comp < NUM_COMP; comp++) {
+      devptr[comp * compstride + tx] += sBuffer[comp][tx];
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // read U of a 2D element into registers rU[][][] --  for all components of a single dim
 // dU is assumed to be offset by elem-stride and dim-stride
@@ -107,6 +119,23 @@ static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[D
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into V of a 2D element from registers rV[][][] to global memory --  for all components of a single dim
+// dV is assumed to be offset by elem-stride and dim-stride
+// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
+// i_DIM specifies which dimension is being written to in dV
+// rV_SIZE can be different from P (e.g. max(P, Q))
+template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
+static __device__ __inline__ void sum_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+  if (tx < Q) {
+    for (int comp = 0; comp < NUM_COMP; comp++) {
+      for (int j = 0; j < Q; j++) {
+        dV[comp * compstride + j * Q + tx] += rV[i_DIM][comp][j];
+      }
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // read U of a 3D element into registers rU[][][] --  for all components of a single dim
 // dU is assumed to be offset by elem-stride and dim-stride
@@ -178,6 +207,23 @@ static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[D
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into V of a 3D element from registers rV[][][] to global memory --  for all components of a single dim
+// dV is assumed to point directly to the element (i.e. already offset by elem-stride)
+// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
+// i_DIM specifies which dimension is being written to in dV
+// rV_SIZE can be different from P (e.g. max(P, Q))
+template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
+static __device__ __inline__ void sum_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+  if (tx < (Q * Q)) {
+    for (int comp = 0; comp < NUM_COMP; comp++) {
+      for (int j = 0; j < Q; j++) {
+        dV[comp * compstride + j * (Q * Q) + tx] += rV[i_DIM][comp][j];
+      }
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // reads T (no-trans) into shared memory
 // T is B x J
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index bc16186459..6be4e3ddf7 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1486,7 +1486,7 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
 }
 
 /**
-  @brief Apply basis evaluation from nodes to quadrature points or vice versa
+  @brief Check input vector dimensions for CeedBasisApply[Add]
 
   @param[in]  basis     `CeedBasis` to evaluate
   @param[in]  num_elem  The number of elements to apply the basis evaluation to;
@@ -1504,9 +1504,9 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
-int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
   CeedInt  dim, num_comp, q_comp, num_nodes, num_qpts;
   CeedSize u_length = 0, v_length;
   Ceed     ceed;
@@ -1520,8 +1520,6 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
   CeedCall(CeedVectorGetLength(v, &v_length));
   if (u) CeedCall(CeedVectorGetLength(u, &u_length));
 
-  CeedCheck(basis->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply");
-
   // Check compatibility of topological and geometrical dimensions
   CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0 && u_length % num_qpts == 0) ||
                 (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0 && v_length % num_qpts == 0),
@@ -1544,13 +1542,68 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
       break;
   }
   CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Apply basis evaluation from nodes to quadrature points or vice versa
+
+  @param[in]  basis     `CeedBasis` to evaluate
+  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  t_mode    @ref CEED_NOTRANSPOSE to evaluate from nodes to quadrature points;
+                          @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes
+  @param[in]  eval_mode @ref CEED_EVAL_NONE to use values directly,
+                          @ref CEED_EVAL_INTERP to use interpolated values,
+                          @ref CEED_EVAL_GRAD to use gradients,
+                          @ref CEED_EVAL_DIV to use divergence,
+                          @ref CEED_EVAL_CURL to use curl,
+                          @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  u         Input `CeedVector`
+  @param[out] v         Output `CeedVector`
+
+  @return An error code: 0 - success, otherwise - failure
 
+  @ref User
+**/
+int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v));
+  CeedCheck(basis->Apply, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply");
   CeedCall(basis->Apply(basis, num_elem, t_mode, eval_mode, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Apply basis evaluation from nodes to arbitrary points
+  @brief Apply basis evaluation from quadrature points to nodes and sum into target vector
+
+  @param[in]  basis     `CeedBasis` to evaluate
+  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  t_mode    @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes;
+                           @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAdd()`
+  @param[in]  eval_mode @ref CEED_EVAL_NONE to use values directly,
+                          @ref CEED_EVAL_INTERP to use interpolated values,
+                          @ref CEED_EVAL_GRAD to use gradients,
+                          @ref CEED_EVAL_DIV to use divergence,
+                          @ref CEED_EVAL_CURL to use curl,
+                          @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  u         Input `CeedVector`
+  @param[out] v         Output `CeedVector` to sum into
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAdd only supports CEED_TRANSPOSE");
+  CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v));
+  CeedCheck(basis->ApplyAdd, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedBasisApplyAdd");
+  CeedCall(basis->ApplyAdd(basis, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints
 
   @param[in]  basis      `CeedBasis` to evaluate
   @param[in]  num_elem   The number of elements to apply the basis evaluation to;
@@ -1567,11 +1620,10 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
-int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
-                           CeedVector x_ref, CeedVector u, CeedVector v) {
-  bool     is_tensor_basis;
+static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                           CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0;
   CeedSize x_length = 0, u_length = 0, v_length;
   Ceed     ceed;
@@ -1624,16 +1676,50 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num
       // LCOV_EXCL_STOP
   }
   CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Backend method
-  if (basis->ApplyAtPoints) {
-    CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
-    return CEED_ERROR_SUCCESS;
-  }
+/**
+  @brief Default implimentation to apply basis evaluation from nodes to arbitrary points
+
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  apply_add  Sum result into target vector or overwrite
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0];
+  Ceed    ceed;
+
+  CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetDimension(basis, &dim));
+  // Inserting check because clang-tidy doesn't understand this cannot occur
+  CeedCheck(dim > 0, ceed, CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required");
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
 
   // Default implementation
-  CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
-  CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases");
+  {
+    bool is_tensor_basis;
+
+    CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
+    CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases");
+  }
   CeedCheck(num_elem == 1, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary  points only supported for a single element at a time");
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCall(CeedVectorSetValue(v, 1.0));
@@ -1805,13 +1891,77 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num
       CeedCall(CeedVectorRestoreArrayRead(u, &u_array));
 
       // -- Interpolate transpose from Chebyshev coefficients
-      CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
+      if (apply_add) CeedCall(CeedBasisApplyAdd(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
+      else CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
       break;
     }
   }
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Apply basis evaluation from nodes to arbitrary points
+
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                           CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  if (basis->ApplyAtPoints) {
+    CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  } else {
+    CeedCall(CeedBasisApplyAtPoints_Core(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Apply basis evaluation from nodes to arbitrary points and sum into target vector
+
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAddAtPoints()`
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                              CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAddAtPoints only supports CEED_TRANSPOSE");
+  CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  if (basis->ApplyAddAtPoints) {
+    CeedCall(basis->ApplyAddAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  } else {
+    CeedCall(CeedBasisApplyAtPoints_Core(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the `Ceed` associated with a `CeedBasis`
 
diff --git a/interface/ceed.c b/interface/ceed.c
index 3c79c59b33..3bdd471454 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -952,7 +952,9 @@ int CeedInit(const char *resource, Ceed *ceed) {
       CEED_FTABLE_ENTRY(CeedElemRestriction, GetAtPointsElementOffset),
       CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy),
       CEED_FTABLE_ENTRY(CeedBasis, Apply),
+      CEED_FTABLE_ENTRY(CeedBasis, ApplyAdd),
       CEED_FTABLE_ENTRY(CeedBasis, ApplyAtPoints),
+      CEED_FTABLE_ENTRY(CeedBasis, ApplyAddAtPoints),
       CEED_FTABLE_ENTRY(CeedBasis, Destroy),
       CEED_FTABLE_ENTRY(CeedTensorContract, Apply),
       CEED_FTABLE_ENTRY(CeedTensorContract, Destroy),
diff --git a/tests/README.md b/tests/README.md
index fd6e426420..031ff5a030 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -15,7 +15,8 @@ The tests are organized by API object, and some tests are further organized, as
     2. CeedBasis simplex basis tests\
     3. CeedBasis non-tensor H(div) basis tests\
     4. CeedBasis non-tensor H(curl) basis tests\
-    5. CeedBasis evaluation at arbitrary points tests
+    5. CeedBasis evaluation at arbitrary points tests\
+    6. CeedBasis ApplyAdd tests
 4. CeedQFunction Tests\
     0. CeedQFunction user code tests\
     1. CeedQFunction gallery code tests
diff --git a/tests/t360-basis.c b/tests/t360-basis.c
new file mode 100644
index 0000000000..f953157e1c
--- /dev/null
+++ b/tests/t360-basis.c
@@ -0,0 +1,56 @@
+/// @file
+/// Test interpolation ApplyAdd in multiple dimensions
+/// \test Test interpolation ApplyAdd in multiple dimensions
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector u, u_q, v, v_q, w_q;
+    CeedBasis  basis;
+    CeedInt    p = 4, q = 5, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim);
+
+    CeedVectorCreate(ceed, p_dim, &u);
+    CeedVectorCreate(ceed, p_dim, &v);
+    CeedVectorSetValue(u, 1.0);
+    CeedVectorSetValue(v, 0.0);
+    CeedVectorCreate(ceed, q_dim, &u_q);
+    CeedVectorCreate(ceed, q_dim, &v_q);
+    CeedVectorCreate(ceed, q_dim, &w_q);
+
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis);
+
+    // Compute area
+    CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q);
+    CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q);
+    CeedVectorPointwiseMult(v_q, u_q, w_q);
+    CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+    // Double area computed
+    CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+
+    // Check area
+    {
+      const CeedScalar *v_array;
+      CeedScalar        area = 0.0;
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      for (CeedInt i = 0; i < p_dim; i++) area += v_array[i];
+      if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim));
+      CeedVectorRestoreArrayRead(v, &v_array);
+    }
+
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&v);
+    CeedVectorDestroy(&u_q);
+    CeedVectorDestroy(&v_q);
+    CeedVectorDestroy(&w_q);
+    CeedBasisDestroy(&basis);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t361-basis.c b/tests/t361-basis.c
new file mode 100644
index 0000000000..6671a39ae5
--- /dev/null
+++ b/tests/t361-basis.c
@@ -0,0 +1,116 @@
+/// @file
+/// Test grad ApplyAdd in multiple dimensions
+/// \test Test grad ApplyAdd in multiple dimensions
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) {
+  CeedScalar result = tanh(x[0] + 0.1);
+  if (dim > 1) result += atan(x[1] + 0.2);
+  if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3));
+  return result;
+}
+
+static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
+  CeedScalar tol;
+  if (scalar_type == CEED_SCALAR_FP32) {
+    if (dim == 3) tol = 0.05;
+    else tol = 1.e-3;
+  } else {
+    tol = 1.e-10;
+  }
+  return 2.0 * tol;
+}
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector x, x_q, u, u_q, ones, v;
+    CeedBasis  basis_x_lobatto, basis_u_gauss;
+    CeedInt    p = 8, q = 10, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim), x_dim = CeedIntPow(2, dim);
+    CeedScalar sum_1 = 0, sum_2 = 0;
+
+    CeedVectorCreate(ceed, x_dim * dim, &x);
+    {
+      CeedScalar x_array[x_dim * dim];
+
+      for (CeedInt d = 0; d < dim; d++) {
+        for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 1 : -1;
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedVectorCreate(ceed, p_dim * dim, &x_q);
+    CeedVectorSetValue(x_q, 0);
+    CeedVectorCreate(ceed, p_dim, &u);
+    CeedVectorCreate(ceed, q_dim * dim, &u_q);
+    CeedVectorSetValue(u_q, 0);
+    CeedVectorCreate(ceed, q_dim * dim, &ones);
+    CeedVectorSetValue(ones, 1);
+    CeedVectorCreate(ceed, p_dim, &v);
+    CeedVectorSetValue(v, 0);
+
+    // Get function values at quadrature points
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x_lobatto);
+    CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_q);
+
+    {
+      const CeedScalar *x_q_array;
+      CeedScalar        u_array[p_dim];
+
+      CeedVectorGetArrayRead(x_q, CEED_MEM_HOST, &x_q_array);
+      for (CeedInt i = 0; i < p_dim; i++) {
+        CeedScalar coord[dim];
+
+        for (CeedInt d = 0; d < dim; d++) coord[d] = x_q_array[d * p_dim + i];
+        u_array[i] = Eval(dim, coord);
+      }
+      CeedVectorRestoreArrayRead(x_q, &x_q_array);
+      CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, u_array);
+    }
+
+    // Calculate G u at quadrature points, G' * 1 at dofs
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u_gauss);
+    CeedBasisApply(basis_u_gauss, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u, u_q);
+    CeedVectorScale(u_q, 2.0);
+    CeedBasisApply(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v);
+    CeedBasisApplyAdd(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v);
+
+    // Check if 1' * G * u = u' * (G' * 1)
+    {
+      const CeedScalar *v_array, *u_array, *u_q_array;
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
+      CeedVectorGetArrayRead(u_q, CEED_MEM_HOST, &u_q_array);
+      for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i];
+      for (CeedInt i = 0; i < dim * q_dim; i++) sum_2 += u_q_array[i];
+      CeedVectorRestoreArrayRead(v, &v_array);
+      CeedVectorRestoreArrayRead(u, &u_array);
+      CeedVectorRestoreArrayRead(u_q, &u_q_array);
+    }
+    {
+      CeedScalarType scalar_type;
+
+      CeedGetScalarType(&scalar_type);
+
+      CeedScalar tol = GetTolerance(scalar_type, dim);
+
+      if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %0.12f != %0.12f\n", dim, sum_1, sum_2);
+    }
+
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_q);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&u_q);
+    CeedVectorDestroy(&ones);
+    CeedVectorDestroy(&v);
+    CeedBasisDestroy(&basis_x_lobatto);
+    CeedBasisDestroy(&basis_u_gauss);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t362-basis.c b/tests/t362-basis.c
new file mode 100644
index 0000000000..bff1937d66
--- /dev/null
+++ b/tests/t362-basis.c
@@ -0,0 +1,59 @@
+/// @file
+/// Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis
+/// \test Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "t320-basis.h"
+
+// main test
+int main(int argc, char **argv) {
+  Ceed          ceed;
+  CeedVector    u, v, u_q, v_q, w_q;
+  const CeedInt p = 6, q = 4, dim = 2;
+  CeedBasis     basis;
+  CeedScalar    q_ref[dim * q], q_weight[q];
+  CeedScalar    interp[p * q], grad[dim * p * q];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, p, &u);
+  CeedVectorCreate(ceed, p, &v);
+  CeedVectorSetValue(u, 1.0);
+  CeedVectorSetValue(v, 0.0);
+  CeedVectorCreate(ceed, q, &u_q);
+  CeedVectorCreate(ceed, q, &v_q);
+  CeedVectorCreate(ceed, q, &w_q);
+
+  Build2DSimplex(q_ref, q_weight, interp, grad);
+  CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
+
+  // Compute area
+  CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q);
+  CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q);
+  CeedVectorPointwiseMult(v_q, u_q, w_q);
+  CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+  // Double area computed
+  CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+
+  // Check area
+  {
+    const CeedScalar *v_array;
+    CeedScalar        area = 0.0;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < p; i++) area += v_array[i];
+    if (fabs(area - 1.0) > 1E-6) printf("Incorrect area computed %f != %f\n", area, 1.0);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_q);
+  CeedVectorDestroy(&v_q);
+  CeedVectorDestroy(&w_q);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t363-basis.c b/tests/t363-basis.c
new file mode 100644
index 0000000000..6c19f34027
--- /dev/null
+++ b/tests/t363-basis.c
@@ -0,0 +1,54 @@
+/// @file
+/// Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+/// \test Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "t320-basis.h"
+
+int main(int argc, char **argv) {
+  Ceed          ceed;
+  CeedVector    u, v;
+  const CeedInt p = 6, q = 4, dim = 2;
+  CeedBasis     basis;
+  CeedScalar    q_ref[dim * q], q_weight[q];
+  CeedScalar    interp[p * q], grad[dim * p * q];
+  CeedScalar    column_sum[p];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, q * dim, &u);
+  CeedVectorSetValue(u, 1);
+  CeedVectorCreate(ceed, p, &v);
+  CeedVectorSetValue(v, 0);
+
+  Build2DSimplex(q_ref, q_weight, interp, grad);
+  CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
+
+  CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+  CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+
+  // Check values at quadrature points
+  for (int i = 0; i < p; i++) {
+    column_sum[i] = 0;
+    for (int j = 0; j < q * dim; j++) {
+      column_sum[i] += grad[i + j * p];
+    }
+  }
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (int i = 0; i < p; i++) {
+      if (fabs(column_sum[i] - v_array[i] / 2.0) > 100. * CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, v_array[i] / 2.0, column_sum[i]);
+    }
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t364-basis.c b/tests/t364-basis.c
new file mode 100644
index 0000000000..6ab4058d30
--- /dev/null
+++ b/tests/t364-basis.c
@@ -0,0 +1,98 @@
+/// @file
+/// Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+/// \test Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#define ALEN(a) (sizeof(a) / sizeof((a)[0]))
+
+static CeedScalar Eval(CeedScalar x, CeedInt n, const CeedScalar *c) {
+  CeedScalar y = c[n - 1];
+  for (CeedInt i = n - 2; i >= 0; i--) y = y * x + c[i];
+  return y;
+}
+
+int main(int argc, char **argv) {
+  Ceed             ceed;
+  CeedVector       x, x_nodes, x_points, x_point, u, v, u_point, v_point;
+  CeedBasis        basis_x, basis_u;
+  const CeedInt    p = 5, q = 5, num_points = 4;
+  const CeedScalar c[4] = {1, 2, 3, 4};  // 1 + 2x + 3x^2 + ...
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, 2, &x);
+  CeedVectorCreate(ceed, p, &x_nodes);
+  CeedVectorCreate(ceed, num_points, &x_points);
+  CeedVectorCreate(ceed, 1, &x_point);
+  CeedVectorCreate(ceed, p, &u);
+  CeedVectorCreate(ceed, num_points, &v);
+  CeedVectorCreate(ceed, p, &u_point);
+  CeedVectorCreate(ceed, 1, &v_point);
+  CeedVectorSetValue(v_point, 1.0);
+
+  // Get nodal coordinates
+  CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, p, CEED_GAUSS_LOBATTO, &basis_x);
+  {
+    CeedScalar x_array[2];
+
+    for (CeedInt i = 0; i < 2; i++) x_array[i] = CeedIntPow(-1, i + 1);
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes);
+
+  // Set values of u at nodes
+  {
+    const CeedScalar *x_array;
+    CeedScalar        u_array[p];
+
+    CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array);
+    for (CeedInt i = 0; i < p; i++) u_array[i] = Eval(x_array[i], ALEN(c), c);
+    CeedVectorRestoreArrayRead(x_nodes, &x_array);
+    CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array);
+  }
+
+  // Interpolate to arbitrary points
+  CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u);
+  {
+    CeedScalar x_array[4] = {-0.33, -0.65, 0.16, 0.99};
+
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+
+  for (CeedInt i = 0; i < num_points; i++) {
+    const CeedInt     num_point[1] = {1};
+    CeedScalar        fx           = 0.0;
+    const CeedScalar *x_array, *u_array, *v_array, *u_point_array;
+
+    CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array);
+    CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    CeedVectorSetValue(x_point, x_array[i]);
+    CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    // Double it
+    CeedBasisApplyAddAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
+    for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j];
+    if (fabs(v_array[i] * 2.0 - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i] * 2.0, fx, x_array[i]);
+    CeedVectorRestoreArrayRead(u_point, &u_point_array);
+    CeedVectorRestoreArrayRead(x_points, &x_array);
+    CeedVectorRestoreArrayRead(u, &u_array);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&x_nodes);
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&x_point);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_point);
+  CeedVectorDestroy(&v_point);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t365-basis.c b/tests/t365-basis.c
new file mode 100644
index 0000000000..74f93ce881
--- /dev/null
+++ b/tests/t365-basis.c
@@ -0,0 +1,123 @@
+/// @file
+/// Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+/// \test Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) {
+  CeedScalar result = tanh(x[0] + 0.1);
+  if (dim > 1) result += atan(x[1] + 0.2);
+  if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3));
+  return result;
+}
+
+static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
+  CeedScalar tol;
+  if (scalar_type == CEED_SCALAR_FP32) {
+    if (dim == 3) tol = 0.005;
+    else tol = 1.e-4;
+  } else {
+    tol = 1.e-11;
+  }
+  return tol;
+}
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector    x, x_nodes, x_points, u, u_points, v, ones;
+    CeedBasis     basis_x, basis_u;
+    const CeedInt p = 9, q = 9, num_points = 4, x_dim = CeedIntPow(2, dim), p_dim = CeedIntPow(p, dim);
+    CeedScalar    sum_1 = 0, sum_2 = 0;
+
+    CeedVectorCreate(ceed, x_dim * dim, &x);
+    CeedVectorCreate(ceed, p_dim * dim, &x_nodes);
+    CeedVectorCreate(ceed, num_points * dim, &x_points);
+    CeedVectorCreate(ceed, p_dim, &u);
+    CeedVectorCreate(ceed, num_points * dim, &u_points);
+    CeedVectorCreate(ceed, p_dim, &v);
+    CeedVectorCreate(ceed, num_points * dim, &ones);
+
+    CeedVectorSetValue(ones, 1);
+    CeedVectorSetValue(v, 0);
+
+    // Get nodal coordinates
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x);
+    {
+      CeedScalar x_array[x_dim * dim];
+
+      for (CeedInt d = 0; d < dim; d++) {
+        for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 1 : -1;
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes);
+
+    // Set values of u at nodes
+    {
+      const CeedScalar *x_array;
+      CeedScalar        u_array[p_dim];
+
+      CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array);
+      for (CeedInt i = 0; i < p_dim; i++) {
+        CeedScalar coord[dim];
+
+        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_dim + i];
+        u_array[i] = Eval(dim, coord);
+      }
+      CeedVectorRestoreArrayRead(x_nodes, &x_array);
+      CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array);
+    }
+
+    // Interpolate to arbitrary points
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+    {
+      CeedScalar x_array[12] = {-0.33, -0.65, 0.16, 0.99, -0.65, 0.16, 0.99, -0.33, 0.16, 0.99, -0.33, -0.65};
+
+      CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+
+    // Calculate G u at arbitrary points, G' * 1 at dofs
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
+    // Double it
+    CeedBasisApplyAddAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
+    {
+      const CeedScalar *u_array, *v_array, *u_points_array;
+
+      CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      CeedVectorGetArrayRead(u_points, CEED_MEM_HOST, &u_points_array);
+      for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i];
+      for (CeedInt i = 0; i < num_points * dim; i++) sum_2 += u_points_array[i];
+      CeedVectorRestoreArrayRead(u, &u_array);
+      CeedVectorRestoreArrayRead(v, &v_array);
+      CeedVectorRestoreArrayRead(u_points, &u_points_array);
+    }
+    {
+      CeedScalarType scalar_type;
+
+      CeedGetScalarType(&scalar_type);
+
+      CeedScalar tol = GetTolerance(scalar_type, dim);
+
+      if (fabs(sum_1 - 2.0 * sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, 2.0 * sum_2);
+    }
+
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_nodes);
+    CeedVectorDestroy(&x_points);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&u_points);
+    CeedVectorDestroy(&ones);
+    CeedVectorDestroy(&v);
+    CeedBasisDestroy(&basis_x);
+    CeedBasisDestroy(&basis_u);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}

From 4b3e95d5da7d4d1a3e8f0acb9d6f6cc36918381b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 21 Aug 2024 17:10:02 +0100
Subject: [PATCH 139/571] GPU Gen Reorganize (#1637)

* cuda - pull out basis setup for gen

* cuda - functions for adding basis, rstr gen actions

* cuda - pull QFunction logic into separate fn for gen

* cuda - minor formatting

* cuda - fix basis errorr

* cuda - rename collograd_parallelization to 3d_slices

* cuda - another gen setup function separated

* hip - update gen source building to match cuda

* gpu - fix min size of QF inputs for gen
---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 1056 +++++++++--------
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 1021 +++++++++-------
 2 files changed, 1162 insertions(+), 915 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 8b2a8dfee5..28583b103b 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -23,358 +23,227 @@
 #include "ceed-cuda-gen.h"
 
 //------------------------------------------------------------------------------
-// Build single operator kernel
+// Determine type of operator
 //------------------------------------------------------------------------------
-extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
-  using std::ostringstream;
-  using std::string;
-
-  bool                      is_setup_done, is_identity_qf;
-  struct cudaDeviceProp     prop;
-  Ceed                      ceed;
-  Ceed_Cuda                *ceed_data;
-  CeedSize                  l_size;
-  CeedInt                   Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1;
-  CeedEvalMode              eval_mode;
-  CeedElemRestriction       elem_rstr;
-  CeedElemRestriction_Cuda *rstr_data;
-  CeedBasis                 basis;
-  CeedBasis_Cuda_shared    *basis_data;
-  CeedQFunctionField       *qf_input_fields, *qf_output_fields;
-  CeedQFunction_Cuda_gen   *qf_data;
-  CeedQFunction             qf;
-  CeedOperatorField        *op_input_fields, *op_output_fields;
-  CeedOperator_Cuda_gen    *data;
-
-  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
-  if (is_setup_done) return CEED_ERROR_SUCCESS;
-
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  Q_1d = Q;
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
-  // TODO: put in a function?
-  // Check for restriction only identity operator
-  CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
-  if (is_identity_qf) {
-    CeedEvalMode eval_mode_in, eval_mode_out;
-
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
-    CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
-              "Backend does not implement restriction only identity operators");
-  }
-
-  ostringstream code;
-
-  // TODO: put in a function?
-  // Add atomicAdd function for old NVidia architectures
-  CeedCallBackend(CeedGetData(ceed, &ceed_data));
-  CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
-  if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
-    char       *atomic_add_source;
-    const char *atomic_add_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source));
-    code << atomic_add_source;
-    CeedCallBackend(CeedFree(&atomic_add_path));
-    CeedCallBackend(CeedFree(&atomic_add_source));
-  }
-
-  // Load basis source files
-  // TODO: generalize to accept different device functions?
-  {
-    char       *tensor_basis_kernel_source;
-    const char *tensor_basis_kernel_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
-    code << tensor_basis_kernel_source;
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
-  }
-  {
-    char       *cuda_gen_template_source;
-    const char *cuda_gen_template_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source));
-    code << cuda_gen_template_source;
-    CeedCallBackend(CeedFree(&cuda_gen_template_path));
-    CeedCallBackend(CeedFree(&cuda_gen_template_source));
-  }
-
-  // Get QFunction source and name
-  string qfunction_source(qf_data->qfunction_source);
-  string qfunction_name(qf_data->qfunction_name);
-  string operator_name;
-  operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
-
+static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
+                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
+                                                bool *use_3d_slices) {
   // Find dim, P_1d, Q_1d
-  data->max_P_1d = 0;
+  *max_P_1d  = 0;
+  *Q_1d      = 0;
+  *dim       = 0;
+  *is_tensor = true;
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
+
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
-
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      bool    is_field_tensor;
+      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
 
       // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      data->max_P_1d = CeedIntMax(data->max_P_1d, P_1d);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
+      *is_tensor = *is_tensor && is_field_tensor;
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *dim = field_dim;
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q_1d = field_Q_1d;
     }
   }
-  // Check output bases for Q_1d, dim as well
-  //   The only input basis might be CEED_BASIS_NONE
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedBasis basis;
+
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
-
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      bool    is_field_tensor;
+      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
 
-      // Collect Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      // Collect dim, P_1d, and Q_1d
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
+      *is_tensor = *is_tensor && is_field_tensor;
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *dim = field_dim;
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q_1d = field_Q_1d;
     }
   }
-  data->dim  = dim;
-  data->Q_1d = Q_1d;
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
-  // TODO: put in a function?
-  bool use_collograd_parallelization = false;
-
-  if (dim == 3) {
+  *use_3d_slices = false;
+  if (*dim == 3) {
     bool was_grad_found = false;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Cuda_shared *basis_data;
+        CeedBasis              basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Cuda_shared *basis_data;
+        CeedBasis              basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
       }
     }
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Define CEED_Q_VLA
-  code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || use_collograd_parallelization) {
-    code << "#define CEED_Q_VLA 1\n\n";
-  } else {
-    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+//------------------------------------------------------------------------------
+// Setup fields
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
+                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+  std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string            P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  std::string            option_name = (is_input ? "inputs" : "outputs");
+  CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
+  CeedInt                elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedElemRestriction    elem_rstr;
+  CeedBasis_Cuda_shared *basis_data;
+  CeedBasis              basis;
+
+  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
-
-  code << qfunction_source;
-
-  // Setup
-  code << "\n// -----------------------------------------------------------------------------\n";
-  code << "\nextern \"C\" __global__ void " << operator_name
-       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar* W) {\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar* d_u_" << i << " = fields.inputs[" << i << "];\n";
-    }
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
   }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar* d_v_" << i << " = fields.outputs[" << i << "];\n";
+  // Set field constants
+  if (eval_mode != CEED_EVAL_WEIGHT) {
+    code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
+    code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
 
-  code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
-
-  code << "  extern __shared__ CeedScalar slice[];\n";
-  // TODO put in a function? InitSharedData_Cuda?
-  code << "  SharedData_Cuda data;\n";
-  code << "  data.t_id_x = threadIdx.x;\n";
-  code << "  data.t_id_y = threadIdx.y;\n";
-  code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
-
-  code << "\n  // -- Input field constants and basis data --\n";
-  // TODO: Put in a function?
-  // Initialize constants, and matrices B and G
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "  // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-
-    // Set field constants
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-        code << "  const CeedInt P_in_" << i << " = " << P_1d << ";\n";
+  // Load basis data
+  code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      break;
+    case CEED_EVAL_INTERP:
+      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+      else data->B.outputs[i] = basis_data->d_interp_1d;
+      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
+      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      break;
+    case CEED_EVAL_GRAD:
+      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+      else data->B.outputs[i] = basis_data->d_interp_1d;
+      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
+      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (use_3d_slices) {
+        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
+        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
+        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+        code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
       } else {
-        code << "  const CeedInt P_in_" << i << " = " << Q_1d << ";\n";
-      }
-      code << "  const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n";
-    }
+        bool has_collo_grad = basis_data->d_collo_grad_1d;
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.inputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i << ");\n";
+        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        if (has_collo_grad) {
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+          code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.inputs[i]   = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i
-               << ");\n";
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n";
+          code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         }
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
-      case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
-      case CEED_EVAL_CURL:
-        break;  // TODO: Not implemented
-    }
+      }
+      break;
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
+      // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+      break;  // TODO: Not implemented
+    case CEED_EVAL_CURL:
+      break;  // TODO: Not implemented
+              // LCOV_EXCL_STOP
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  code << "\n  // -- Output field constants and basis data --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-
-    // Set field constants
-    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-    if (basis != CEED_BASIS_NONE) {
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      code << "  const CeedInt P_out_" << i << " = " << P_1d << ";\n";
-    } else {
-      code << "  const CeedInt P_out_" << i << " = " << Q_1d << ";\n";
-    }
-    code << "  const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n";
+//------------------------------------------------------------------------------
+// Restriction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
+                                                       CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
+                                                       bool use_3d_slices) {
+  std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string               P_name     = "P_1d" + var_suffix;
+  CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
+  CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedSize                  l_size;
+  CeedElemRestriction_Cuda *rstr_data;
+  CeedElemRestriction       elem_rstr;
+  CeedBasis                 basis;
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.outputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.outputs[" << i << "], s_G_out_" << i << ");\n";
-        } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.outputs[i]  = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_"
-               << i << ");\n";
-        }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
-      }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
-      }
-        // LCOV_EXCL_STOP
-    }
-  }
-  code << "\n  // -- Element loop --\n";
-  code << "  __syncthreads();\n";
-  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
-  // Input basis apply if needed
-  // Generate the correct eval mode code for each input
-  code << "    // -- Input field restrictions and basis actions --\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "    // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+  }
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-    // TODO: put in a function?
-    // Restriction
-    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
-      code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
-
+  // Restriction
+  if (is_input) {
+    // Input
+    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
       bool is_strided;
 
+      code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
       CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
       if (!is_strided) {
         CeedInt comp_stride;
 
         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-        code << "    const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
+        code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
-        CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
         data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    readDofsOffset" << dim << "d<num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ">(data, l_size_in_" << i
-             << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n";
+        code << "    readDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
+             << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
       } else {
         bool    has_backend_strides;
         CeedInt num_elem;
@@ -387,86 +256,190 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    readDofsStrided" << dim << "d<num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-             << ">(data, elem, d_u_" << i << ", r_u_" << i << ");\n";
+        code << "    readDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+             << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
       }
     }
+  } else {
+    // Output
+    bool is_strided;
+
+    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+    if (!is_strided) {
+      CeedInt comp_stride;
 
-    // TODO: put in a function?
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << "    // CompStride: " << comp_stride << "\n";
+      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+      code << "    writeDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
+           << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+    } else {
+      bool    has_backend_strides;
+      CeedInt num_elem;
+
+      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+
+      if (!has_backend_strides) {
+        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+      }
+      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
+      code << "    writeDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+           << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
+                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
+                                                 bool use_3d_slices) {
+  std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedElemRestriction elem_rstr;
+  CeedBasis           basis;
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+  }
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
+
+  // Basis
+  code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!use_collograd_parallelization) {
-          code << "    CeedScalar* r_t_" << i << " = r_u_" << i << ";\n";
+        if (!use_3d_slices) {
+          code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_"
-             << i << ", r_t_" << i << ");\n";
+        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+             << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         break;
       case CEED_EVAL_GRAD:
-        if (use_collograd_parallelization) {
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i
-               << ", s_B_in_" << i << ", r_t_" << i << ");\n";
+        if (use_3d_slices) {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
-          CeedInt P_1d;
-
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n";
-          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_in_" << i
-               << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix
+               << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q"
+               << var_suffix << ");\n";
         }
         break;
-      case CEED_EVAL_WEIGHT:
-        code << "    CeedScalar r_t_" << i << "[Q_1d];\n";
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis_Cuda_shared *basis_data;
+
+        code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
         data->W = basis_data->d_q_weight_1d;
-        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<Q_1d>(data, W, r_t_" << i << ");\n";
+        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+        break;
+      }
+      // LCOV_EXCL_START
+      case CEED_EVAL_DIV:
+        break;  // TODO: Not implemented
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
+    }
+  } else {
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
         break;  // No action
+      case CEED_EVAL_INTERP:
+        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+             << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        break;
+      case CEED_EVAL_GRAD:
+        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        if (use_3d_slices) {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp"
+               << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix
+               << ", r_e" << var_suffix << ");\n";
+        }
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT:
+        break;  // Should not occur
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  // TODO: put in a function + separate collograd logic
-  // Q function
-  code << "\n    // -- Output field setup --\n";
+//------------------------------------------------------------------------------
+// QFunction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt num_input_fields,
+                                                     CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
+                                                     CeedInt num_output_fields, CeedOperatorField *op_output_fields,
+                                                     CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
+                                                     bool use_3d_slices) {
+  std::string         Q_name    = "Q_1d";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedElemRestriction elem_rstr;
+
+  // Setup output arays
+  code << "\n    // -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "\n    // ---- Output field " << i << " ----\n";
+    std::string var_suffix = "_out_" + std::to_string(i);
+
+    code << "    // ---- Output field " << i << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
+      code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+    }
     if (eval_mode == CEED_EVAL_GRAD) {
-      if (use_collograd_parallelization) {
+      if (use_3d_slices) {
         // Accumulator for gradient slices
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-        code << "    for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
-        code << "      for (CeedInt j = 0; j < Q_1d; ++j) {\n";
-        code << "        r_tt_" << i << "[j + i*Q_1d] = 0.0;\n";
-        code << "      }\n";
+        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
+        code << "      r_q" << var_suffix << "[i] = 0.0;\n";
         code << "    }\n";
       } else {
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*dim*Q_1d];\n";
+        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
       }
     }
-    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
-      code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-    }
   }
+
   // We treat quadrature points per slice in 3d to save registers
-  if (use_collograd_parallelization) {
+  if (use_3d_slices) {
     code << "\n    // Note: Using planes of 3D elements\n";
     code << "#pragma unroll\n";
-    code << "    for (CeedInt q = 0; q < Q_1d; q++) {\n";
-    code << "      // -- Input fields --\n";
+    code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
+    code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      // Get elem_size, eval_mode, num_comp
+      std::string var_suffix = "_in_" + std::to_string(i);
+
+      code << "      // ---- Input field " << i << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -474,25 +447,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
         case CEED_EVAL_NONE:
           bool is_strided;
 
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-          if (!is_strided) {
-            CeedInt comp_stride;
-
-            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-            code << "      const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
-            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-            code << "      // CompStride: " << comp_stride << "\n";
-            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      readSliceQuadsOffset"
-                 << "3d<num_comp_in_" << i << ", " << comp_stride << ", Q_1d>(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_"
-                 << i << ", r_q_" << i << ");\n";
-          } else {
+          if (is_strided) {
             bool    has_backend_strides;
-            CeedInt num_elem;
+            CeedInt num_elem, elem_size;
 
             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
@@ -503,199 +464,362 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      readSliceQuadsStrided"
-                 << "3d<num_comp_in_" << i
-                 << ",Q_1d"
-                    ","
-                 << strides[0] << "," << strides[1] << "," << strides[2] << ">(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n";
+            code << "      readSliceQuadsStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
+                 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          } else {
+            CeedSize                  l_size = 0;
+            CeedInt                   comp_stride;
+            CeedElemRestriction_Cuda *rstr_data;
+
+            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+            code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+            code << "      // CompStride: " << comp_stride << "\n";
+            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+            code << "      readSliceQuadsOffset3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
+                 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
-          code << "      for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n";
-          code << "        r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1d];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
+          code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "*dim];\n";
-          code << "      gradCollo3d<num_comp_in_" << i << ",Q_1d>(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      gradCollo3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix << ", r_s"
+               << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_q_" << i << "[1];\n";
-          code << "      r_q_" << i << "[0] = r_t_" << i << "[q];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
+          code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
           break;  // No action
+                  // LCOV_EXCL_START
         case CEED_EVAL_DIV:
           break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields --\n";
+    code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      std::string var_suffix = "_out_" + std::to_string(i);
+
+      code << "      // ---- Output field " << i << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;  // No action
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
           break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
   } else {
-    code << "\n      // Note: Using full elements\n";
-    code << "      // -- Input fields --\n";
+    code << "\n    // Note: Using full elements\n";
+    code << "    {\n";
+    code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      code << "      CeedScalar* r_q_" << i << " = r_t_" << i << ";\n";
+      code << "      // ---- Input field " << i << "\n";
+      code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
-    code << "      // -- Output fields --\n";
+    code << "      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
-      code << "      CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n";
+      code << "      // ---- Output field " << i << "\n";
+      code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
-  code << "\n      // -- QFunction Inputs and outputs --\n";
-  code << "      CeedScalar* in[" << num_input_fields << "];\n";
+
+  // Input and output buffers
+  code << "\n      // -- QFunction inputs and outputs\n";
+  code << "      // ---- Inputs\n";
+  code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "      // ---- Input field " << i << " ----\n";
-    code << "      in[" << i << "] = r_q_" << i << ";\n";
+    code << "      // ------ Input field " << i << "\n";
+    code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
-  code << "      CeedScalar* out[" << num_output_fields << "];\n";
+  code << "      // ---- Outputs\n";
+  code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "      // ---- Output field " << i << " ----\n";
-    code << "      out[" << i << "] = r_qq_" << i << ";\n";
+    code << "      // ------ Output field " << i << "\n";
+    code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
-  code << "\n      // -- Apply QFunction --\n";
+
+  // Apply QFunction
+  code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || use_collograd_parallelization) {
+  if (dim != 3 || use_3d_slices) {
     code << "1";
   } else {
     code << "Q_1d";
   }
-  code << ", in, out);\n";
-  if (use_collograd_parallelization) {
-    code << "      // -- Output fields --\n";
+  code << ", inputs, outputs);\n";
+
+  // Copy or apply transpose grad, if needed
+  if (use_3d_slices) {
+    code << "      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      code << "      // ---- Output field " << i << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
+          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
           code << "      }\n";
           break;  // No action
         case CEED_EVAL_INTERP:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
+          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      gradColloTranspose3d<num_comp_out_" << i << ",Q_1d>(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n";
+          code << "      gradColloTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" << var_suffix
+               << ", r_q" << var_suffix << ");\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
           break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "    }\n";
   }
+  code << "    }\n";
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Output basis apply if needed
-  // Generate the correct eval mode code for each output
-  code << "\n    // -- Output field basis action and restrictions --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "    // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    // TODO put in a function
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        code << "    CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-             << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        if (use_collograd_parallelization) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-               << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        } else {
-          CeedInt P_1d;
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_out_" << i
-               << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n";
-        }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
-      }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
-      }
-        // LCOV_EXCL_STOP
+//------------------------------------------------------------------------------
+// Build single operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
+  bool                    is_tensor = true, use_3d_slices = false;
+  Ceed                    ceed;
+  CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1;
+  CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Cuda_gen *qf_data;
+  CeedQFunction           qf;
+  CeedOperatorField      *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda_gen  *data;
+  std::ostringstream      code;
+
+  {
+    bool is_setup_done;
+
+    CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+    if (is_setup_done) return CEED_ERROR_SUCCESS;
+  }
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Get operator data
+  CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
+                                                       qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
+  if (dim == 0) dim = 1;
+  data->dim = dim;
+  if (Q_1d == 0) {
+    CeedInt Q;
+
+    CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+    Q_1d = Q;
+  }
+  data->Q_1d = Q_1d;
+
+  // Check for restriction only identity operator
+  {
+    bool is_identity_qf;
+
+    CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
+    if (is_identity_qf) {
+      CeedEvalMode eval_mode_in, eval_mode_out;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
+      CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
+                "Backend does not implement restriction only identity operators");
     }
-    // TODO put in a function
-    // Restriction
-    bool is_strided;
-    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-    if (!is_strided) {
-      CeedInt comp_stride;
+  }
 
-      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-      code << "    const CeedInt l_size_out_" << i << " = " << l_size << ";\n";
-      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-      code << "    // CompStride: " << comp_stride << "\n";
-      CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-      code << "    writeDofsOffset" << dim << "d<num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ">(data, l_size_out_" << i
-           << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n";
-    } else {
-      bool    has_backend_strides;
-      CeedInt num_elem;
+  // Add atomicAdd function for old NVidia architectures
+  {
+    Ceed_Cuda            *ceed_data;
+    struct cudaDeviceProp prop;
+
+    CeedCallBackend(CeedGetData(ceed, &ceed_data));
+    CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
+    if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
+      char       *atomic_add_source;
+      const char *atomic_add_path;
+
+      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n");
+      CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source));
+      code << atomic_add_source;
+      CeedCallBackend(CeedFree(&atomic_add_path));
+      CeedCallBackend(CeedFree(&atomic_add_source));
+    }
+  }
 
-      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+  // Load basis source files
+  // TODO: Add non-tensor, AtPoints
+  {
+    char       *tensor_basis_kernel_source;
+    const char *tensor_basis_kernel_path;
 
-      if (!has_backend_strides) {
-        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
-      }
-      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-      code << "    writeDofsStrided" << dim << "d<num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-           << ">(data, elem, r_v_" << i << ", d_v_" << i << ");\n";
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
+    code << tensor_basis_kernel_source;
+    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
+    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
+  }
+  {
+    char       *cuda_gen_template_source;
+    const char *cuda_gen_template_path;
+
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source));
+    code << cuda_gen_template_source;
+    CeedCallBackend(CeedFree(&cuda_gen_template_path));
+    CeedCallBackend(CeedFree(&cuda_gen_template_source));
+  }
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
+
+  // Define CEED_Q_VLA
+  code << "\n#undef CEED_Q_VLA\n";
+  if (dim != 3 || use_3d_slices) {
+    code << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Add user QFunction source
+  {
+    std::string qfunction_source(qf_data->qfunction_source);
+
+    code << qfunction_source;
+  }
+
+  // Setup
+  code << "\n// -----------------------------------------------------------------------------\n";
+  code << "// Operator Kernel\n";
+  code << "// \n";
+  code << "// d_[in,out]_i:   CeedVector device array\n";
+  code << "// r_[in,out]_e_i: Element vector register\n";
+  code << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << "// r_[in,out]_s_i: Quadrature space slice  vector register\n";
+  code << "// \n";
+  code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << "// -----------------------------------------------------------------------------\n";
+  code << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n";
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
     }
   }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << "  const CeedInt dim = " << dim << ";\n";
+  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+
+  // Shared data
+  code << "  extern __shared__ CeedScalar slice[];\n";
+  code << "  SharedData_Cuda data;\n";
+  code << "  data.t_id_x = threadIdx.x;\n";
+  code << "  data.t_id_y = threadIdx.y;\n";
+  code << "  data.t_id_z = threadIdx.z;\n";
+  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << "  data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
+
+  // Initialize constants, and matrices B and G
+  code << "\n  // Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCall(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+  }
+  code << "\n  // Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCall(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+  }
+
+  // Loop over all elements
+  code << "\n  // Element loop\n";
+  code << "  __syncthreads();\n";
+  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+
+  // -- Input restriction and basis
+  code << "    // -- Input field restrictions and basis actions\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    code << "    // ---- Input field " << i << "\n";
+
+    // ---- Restriction
+    CeedCallBackend(
+        CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
+                                                            op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
+
+  // -- Output basis and restriction
+  code << "\n    // -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << "    // ---- Output field " << i << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+
+    // ---- Restriction
+    CeedCallBackend(
+        CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+  }
 
+  // Close loop and function
   code << "  }\n";
   code << "}\n";
   code << "// -----------------------------------------------------------------------------\n\n";
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index c4878a5fed..623c3deb9a 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -50,340 +50,227 @@ extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_e
 }
 
 //------------------------------------------------------------------------------
-// Build single operator kernel
+// Determine type of operator
 //------------------------------------------------------------------------------
-extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
-  using std::ostringstream;
-  using std::string;
-
-  Ceed                     ceed;
-  bool                     is_setup_done, is_identity_qf;
-  CeedSize                 l_size;
-  CeedInt                  Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1;
-  CeedEvalMode             eval_mode;
-  CeedElemRestriction      elem_rstr;
-  CeedElemRestriction_Hip *rstr_data;
-  CeedBasis                basis;
-  CeedBasis_Hip_shared    *basis_data;
-  CeedQFunctionField      *qf_input_fields, *qf_output_fields;
-  CeedQFunction_Hip_gen   *qf_data;
-  CeedQFunction            qf;
-  CeedOperatorField       *op_input_fields, *op_output_fields;
-  CeedOperator_Hip_gen    *data;
-
-  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
-  if (is_setup_done) return CEED_ERROR_SUCCESS;
-
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  Q_1d = Q;
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
-  // TODO: put in a function?
-  // Check for restriction only identity operator
-  CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
-  if (is_identity_qf) {
-    CeedEvalMode eval_mode_in, eval_mode_out;
-
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
-    CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
-              "Backend does not implement restriction only identity operators");
-  }
-
-  ostringstream code;
-
-  // Load basis source files
-  // TODO: generalize to accept different device functions?
-  {
-    char       *tensor_basis_kernel_source;
-    const char *tensor_basis_kernel_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
-    code << tensor_basis_kernel_source;
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
-  }
-  {
-    char       *hip_gen_template_source;
-    const char *hip_gen_template_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-gen-templates.h", &hip_gen_template_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Hip-Gen Template Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source));
-    code << hip_gen_template_source;
-    CeedCallBackend(CeedFree(&hip_gen_template_path));
-    CeedCallBackend(CeedFree(&hip_gen_template_source));
-  }
-
-  // Get QFunction source and name
-  string qfunction_source(qf_data->qfunction_source);
-  string qfunction_name(qf_data->qfunction_name);
-  string operator_name;
-  operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
-
+static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                               CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
+                                               CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
+                                               bool *use_3d_slices) {
   // Find dim, P_1d, Q_1d
-  data->max_P_1d = 0;
+  *max_P_1d  = 0;
+  *Q_1d      = 0;
+  *dim       = 0;
+  *is_tensor = true;
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
+
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
-
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      bool    is_field_tensor;
+      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
 
       // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      if (P_1d > data->max_P_1d) data->max_P_1d = P_1d;
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
+      *is_tensor = *is_tensor && is_field_tensor;
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *dim = field_dim;
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q_1d = field_Q_1d;
     }
   }
-  // Check output bases for Q_1d, dim as well
-  //   The only input basis might be CEED_BASIS_NONE
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+    CeedBasis basis;
 
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
-
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      bool    is_field_tensor;
+      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
 
-      // Collect Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      // Collect dim, P_1d, and Q_1d
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
+      *is_tensor = *is_tensor && is_field_tensor;
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *dim = field_dim;
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q_1d = field_Q_1d;
     }
   }
-  data->dim  = dim;
-  data->Q_1d = Q_1d;
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
-  // TODO: put in a function?
-  bool use_collograd_parallelization = false;
-
-  if (dim == 3) {
+  *use_3d_slices = false;
+  if (*dim == 3) {
     bool was_grad_found = false;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Hip_shared *basis_data;
+        CeedBasis             basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Hip_shared *basis_data;
+        CeedBasis             basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
       }
     }
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Define CEED_Q_VLA
-  code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || use_collograd_parallelization) {
-    code << "#define CEED_Q_VLA 1\n\n";
-  } else {
-    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+//------------------------------------------------------------------------------
+// Setup fields
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
+                                                    CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string           P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  std::string           option_name = (is_input ? "inputs" : "outputs");
+  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
+  CeedInt               elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedElemRestriction   elem_rstr;
+  CeedBasis_Hip_shared *basis_data;
+  CeedBasis             basis;
+
+  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
-
-  code << qfunction_source;
-
-  // Setup
-  code << "\n// -----------------------------------------------------------------------------\n";
-  code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
-  code << "__global__ void " << operator_name
-       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar* d_u_" << i << " = fields.inputs[" << i << "];\n";
-    }
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
   }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar* d_v_" << i << " = fields.outputs[" << i << "];\n";
+  // Set field constants
+  if (eval_mode != CEED_EVAL_WEIGHT) {
+    code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
+    code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
 
-  code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
-
-  code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
-  // TODO put in a function? InitSharedData_Hip?
-  code << "  SharedData_Hip data;\n";
-  code << "  data.t_id_x = threadIdx.x;\n";
-  code << "  data.t_id_y = threadIdx.y;\n";
-  code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
-
-  code << "\n  // -- Input field constants and basis data --\n";
-  // TODO: Put in a function?
-  // Initialize constants, and matrices B and G
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "  // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-
-    // Set field constants
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-        code << "  const CeedInt P_in_" << i << " = " << P_1d << ";\n";
+  // Load basis data
+  code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      break;
+    case CEED_EVAL_INTERP:
+      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+      else data->B.outputs[i] = basis_data->d_interp_1d;
+      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
+      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      break;
+    case CEED_EVAL_GRAD:
+      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+      else data->B.outputs[i] = basis_data->d_interp_1d;
+      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
+      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (use_3d_slices) {
+        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
+        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
+        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+        code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
       } else {
-        code << "  const CeedInt P_in_" << i << " = " << Q_1d << ";\n";
-      }
-      code << "  const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n";
-    }
+        bool has_collo_grad = basis_data->d_collo_grad_1d;
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.inputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i << ");\n";
+        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        if (has_collo_grad) {
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+          code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.inputs[i]   = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i
-               << ");\n";
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n";
+          code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         }
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
-      case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
-      case CEED_EVAL_CURL:
-        break;  // TODO: Not implemented
-    }
+      }
+      break;
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
+      // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+      break;  // TODO: Not implemented
+    case CEED_EVAL_CURL:
+      break;  // TODO: Not implemented
+              // LCOV_EXCL_STOP
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  code << "\n  // -- Output field constants and basis data --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-
-    // Set field constants
-    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-    if (basis != CEED_BASIS_NONE) {
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      code << "  const CeedInt P_out_" << i << " = " << P_1d << ";\n";
-    } else {
-      code << "  const CeedInt P_out_" << i << " = " << Q_1d << ";\n";
-    }
-    code << "  const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n";
+//------------------------------------------------------------------------------
+// Restriction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
+                                                      CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
+                                                      bool use_3d_slices) {
+  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string              P_name     = "P_1d" + var_suffix;
+  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
+  CeedInt                  elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedSize                 l_size;
+  CeedElemRestriction_Hip *rstr_data;
+  CeedElemRestriction      elem_rstr;
+  CeedBasis                basis;
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.outputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.outputs[" << i << "], s_G_out_" << i << ");\n";
-        } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.outputs[i]  = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_"
-               << i << ");\n";
-        }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
-      }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
-      }
-        // LCOV_EXCL_STOP
-    }
-  }
-  code << "\n  // -- Element loop --\n";
-  code << "  __syncthreads();\n";
-  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
-  // Input basis apply if needed
-  // Generate the correct eval mode code for each input
-  code << "    // -- Input field restrictions and basis actions --\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "    // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+  }
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-    // Restriction
-    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
+  // Restriction
+  if (is_input) {
+    // Input
+    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
       bool is_strided;
 
-      code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
-
+      code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
       CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
       if (!is_strided) {
-        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-        code << "    const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
         CeedInt comp_stride;
+
+        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+        code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
-        CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
         data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    readDofsOffset" << dim << "d<num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ">(data, l_size_in_" << i
-             << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n";
+        code << "    readDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
+             << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
       } else {
         bool    has_backend_strides;
         CeedInt num_elem;
@@ -396,85 +283,190 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    readDofsStrided" << dim << "d<num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-             << ">(data, elem, d_u_" << i << ", r_u_" << i << ");\n";
+        code << "    readDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+             << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
       }
     }
+  } else {
+    // Output
+    bool is_strided;
 
-    // TODO: put in a function?
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+    if (!is_strided) {
+      CeedInt comp_stride;
+
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << "    // CompStride: " << comp_stride << "\n";
+      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+      code << "    writeDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
+           << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+    } else {
+      bool    has_backend_strides;
+      CeedInt num_elem;
+
+      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+
+      if (!has_backend_strides) {
+        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+      }
+      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
+      code << "    writeDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+           << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
+                                                CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
+                                                bool use_3d_slices) {
+  std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedElemRestriction elem_rstr;
+  CeedBasis           basis;
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+  }
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
+
+  // Basis
+  code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!use_collograd_parallelization) {
-          code << "    CeedScalar* r_t_" << i << " = r_u_" << i << ";\n";
+        if (!use_3d_slices) {
+          code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_"
-             << i << ", r_t_" << i << ");\n";
+        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+             << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         break;
       case CEED_EVAL_GRAD:
-        if (use_collograd_parallelization) {
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i
-               << ", s_B_in_" << i << ", r_t_" << i << ");\n";
+        if (use_3d_slices) {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
-          CeedInt P_1d;
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n";
-          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_in_" << i
-               << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix
+               << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q"
+               << var_suffix << ");\n";
         }
         break;
-      case CEED_EVAL_WEIGHT:
-        code << "    CeedScalar r_t_" << i << "[Q_1d];\n";
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis_Hip_shared *basis_data;
+
+        code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
         data->W = basis_data->d_q_weight_1d;
-        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<Q_1d>(data, W, r_t_" << i << ");\n";
+        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+        break;
+      }
+      // LCOV_EXCL_START
+      case CEED_EVAL_DIV:
+        break;  // TODO: Not implemented
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
+    }
+  } else {
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
         break;  // No action
+      case CEED_EVAL_INTERP:
+        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+             << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        break;
+      case CEED_EVAL_GRAD:
+        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        if (use_3d_slices) {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp"
+               << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix
+               << ", r_e" << var_suffix << ");\n";
+        }
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT:
+        break;  // Should not occur
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  // TODO: put in a function + separate collograd logic
-  // Q function
-  code << "\n    // -- Output field setup --\n";
+//------------------------------------------------------------------------------
+// QFunction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt num_input_fields,
+                                                    CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
+                                                    CeedInt num_output_fields, CeedOperatorField *op_output_fields,
+                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
+                                                    bool use_3d_slices) {
+  std::string         Q_name    = "Q_1d";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedElemRestriction elem_rstr;
+
+  // Setup output arays
+  code << "\n    // -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "\n    // ---- Output field " << i << " ----\n";
+    std::string var_suffix = "_out_" + std::to_string(i);
+
+    code << "    // ---- Output field " << i << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
+      code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+    }
     if (eval_mode == CEED_EVAL_GRAD) {
-      if (use_collograd_parallelization) {
+      if (use_3d_slices) {
         // Accumulator for gradient slices
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-        code << "    for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
-        code << "      for (CeedInt j = 0; j < Q_1d; ++j) {\n";
-        code << "        r_tt_" << i << "[j + i*Q_1d] = 0.0;\n";
-        code << "      }\n";
+        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
+        code << "      r_q" << var_suffix << "[i] = 0.0;\n";
         code << "    }\n";
       } else {
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*dim*Q_1d];\n";
+        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
       }
     }
-    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
-      code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-    }
   }
+
   // We treat quadrature points per slice in 3d to save registers
-  if (use_collograd_parallelization) {
+  if (use_3d_slices) {
     code << "\n    // Note: Using planes of 3D elements\n";
     code << "#pragma unroll\n";
-    code << "    for (CeedInt q = 0; q < Q_1d; q++) {\n";
-    code << "      // -- Input fields --\n";
+    code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
+    code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      // Get elem_size, eval_mode, num_comp
+      std::string var_suffix = "_in_" + std::to_string(i);
+
+      code << "      // ---- Input field " << i << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -482,25 +474,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
         case CEED_EVAL_NONE:
           bool is_strided;
 
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-          if (!is_strided) {
-            CeedInt comp_stride;
-
-            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-            code << "      const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
-            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-            code << "      // CompStride: " << comp_stride << "\n";
-            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      readSliceQuadsOffset"
-                 << "3d<num_comp_in_" << i << ", " << comp_stride << ", Q_1d>(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_"
-                 << i << ", r_q_" << i << ");\n";
-          } else {
+          if (is_strided) {
             bool    has_backend_strides;
-            CeedInt num_elem;
+            CeedInt num_elem, elem_size;
 
             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
@@ -511,200 +491,343 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      readSliceQuadsStrided"
-                 << "3d<num_comp_in_" << i
-                 << ",Q_1d"
-                    ","
-                 << strides[0] << "," << strides[1] << "," << strides[2] << ">(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n";
+            code << "      readSliceQuadsStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
+                 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          } else {
+            CeedSize                 l_size = 0;
+            CeedInt                  comp_stride;
+            CeedElemRestriction_Hip *rstr_data;
+
+            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+            code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+            code << "      // CompStride: " << comp_stride << "\n";
+            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+            code << "      readSliceQuadsOffset3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
+                 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
-          code << "      for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n";
-          code << "        r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1d];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
+          code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "*dim];\n";
-          code << "      gradCollo3d<num_comp_in_" << i << ",Q_1d>(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      gradCollo3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix << ", r_s"
+               << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_q_" << i << "[1];\n";
-          code << "      r_q_" << i << "[0] = r_t_" << i << "[q];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
+          code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
           break;  // No action
+                  // LCOV_EXCL_START
         case CEED_EVAL_DIV:
           break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields --\n";
+    code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      std::string var_suffix = "_out_" + std::to_string(i);
+
+      code << "      // ---- Output field " << i << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;  // No action
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
           break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
   } else {
-    code << "\n      // Note: Using full elements\n";
-    code << "      // -- Input fields --\n";
+    code << "\n    // Note: Using full elements\n";
+    code << "    {\n";
+    code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      code << "      CeedScalar* r_q_" << i << " = r_t_" << i << ";\n";
+      code << "      // ---- Input field " << i << "\n";
+      code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
-    code << "      // -- Output fields --\n";
+    code << "      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
-      code << "      CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n";
+      code << "      // ---- Output field " << i << "\n";
+      code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
-  code << "\n      // -- QFunction Inputs and outputs --\n";
-  code << "      CeedScalar* in[" << num_input_fields << "];\n";
+
+  // Input and output buffers
+  code << "\n      // -- QFunction inputs and outputs\n";
+  code << "      // ---- Inputs\n";
+  code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "      // ---- Input field " << i << " ----\n";
-    code << "      in[" << i << "] = r_q_" << i << ";\n";
+    code << "      // ------ Input field " << i << "\n";
+    code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
-  code << "      CeedScalar* out[" << num_output_fields << "];\n";
+  code << "      // ---- Outputs\n";
+  code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "      // ---- Output field " << i << " ----\n";
-    code << "      out[" << i << "] = r_qq_" << i << ";\n";
+    code << "      // ------ Output field " << i << "\n";
+    code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
-  code << "\n      // -- Apply QFunction --\n";
+
+  // Apply QFunction
+  code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || use_collograd_parallelization) {
+  if (dim != 3 || use_3d_slices) {
     code << "1";
   } else {
     code << "Q_1d";
   }
-  code << ", in, out);\n";
-  if (use_collograd_parallelization) {
-    code << "      // -- Output fields --\n";
+  code << ", inputs, outputs);\n";
+
+  // Copy or apply transpose grad, if needed
+  if (use_3d_slices) {
+    code << "      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      code << "      // ---- Output field " << i << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
+          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
           code << "      }\n";
           break;  // No action
         case CEED_EVAL_INTERP:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
+          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      gradColloTranspose3d<num_comp_out_" << i << ",Q_1d>(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n";
+          code << "      gradColloTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" << var_suffix
+               << ", r_q" << var_suffix << ");\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
           break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "    }\n";
   }
+  code << "    }\n";
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Output basis apply if needed
-  // Generate the correct eval mode code for each output
-  code << "\n    // -- Output field basis action and restrictions --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "    // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    // TODO put in a function
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        code << "    CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-             << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        if (use_collograd_parallelization) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-               << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        } else {
-          CeedInt P_1d;
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_out_" << i
-               << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n";
-        }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
-      }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
-      }
-        // LCOV_EXCL_STOP
+//------------------------------------------------------------------------------
+// Build single operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
+  bool                   is_tensor = true, use_3d_slices = false;
+  Ceed                   ceed;
+  CeedInt                Q_1d, num_input_fields, num_output_fields, dim = 1;
+  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Hip_gen *qf_data;
+  CeedQFunction          qf;
+  CeedOperatorField     *op_input_fields, *op_output_fields;
+  CeedOperator_Hip_gen  *data;
+  std::ostringstream     code;
+
+  {
+    bool is_setup_done;
+
+    CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+    if (is_setup_done) return CEED_ERROR_SUCCESS;
+  }
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Get operator data
+  CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
+                                                      qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
+  if (dim == 0) dim = 1;
+  data->dim = dim;
+  if (Q_1d == 0) {
+    CeedInt Q;
+
+    CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+    Q_1d = Q;
+  }
+  data->Q_1d = Q_1d;
+
+  // Check for restriction only identity operator
+  {
+    bool is_identity_qf;
+
+    CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
+    if (is_identity_qf) {
+      CeedEvalMode eval_mode_in, eval_mode_out;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
+      CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
+                "Backend does not implement restriction only identity operators");
     }
-    // TODO put in a function
-    // Restriction
-    bool is_strided;
+  }
 
-    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-    if (!is_strided) {
-      CeedInt comp_stride;
+  // Load basis source files
+  // TODO: Add non-tensor, AtPoints
+  {
+    char       *tensor_basis_kernel_source;
+    const char *tensor_basis_kernel_path;
 
-      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-      code << "    const CeedInt l_size_out_" << i << " = " << l_size << ";\n";
-      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-      code << "    // CompStride: " << comp_stride << "\n";
-      CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-      code << "    writeDofsOffset" << dim << "d<num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ">(data, l_size_out_" << i
-           << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n";
-    } else {
-      bool    has_backend_strides;
-      CeedInt num_elem;
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
+    code << tensor_basis_kernel_source;
+    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
+    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
+  }
+  {
+    char       *hip_gen_template_source;
+    const char *hip_gen_template_path;
 
-      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-gen-templates.h", &hip_gen_template_path));
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Hip-Gen Template Source -----\n");
+    CeedCallBackend(CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source));
+    code << hip_gen_template_source;
+    CeedCallBackend(CeedFree(&hip_gen_template_path));
+    CeedCallBackend(CeedFree(&hip_gen_template_source));
+  }
 
-      if (!has_backend_strides) {
-        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
-      }
-      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-      code << "    writeDofsStrided" << dim << "d<num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-           << ">(data, elem, r_v_" << i << ", d_v_" << i << ");\n";
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
+
+  // Define CEED_Q_VLA
+  code << "\n#undef CEED_Q_VLA\n";
+  if (dim != 3 || use_3d_slices) {
+    code << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Add user QFunction source
+  {
+    std::string qfunction_source(qf_data->qfunction_source);
+
+    code << qfunction_source;
+  }
+
+  // Setup
+  code << "\n// -----------------------------------------------------------------------------\n";
+  code << "// Operator Kernel\n";
+  code << "// \n";
+  code << "// d_[in,out]_i:   CeedVector device array\n";
+  code << "// r_[in,out]_e_i: Element vector register\n";
+  code << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << "// r_[in,out]_s_i: Quadrature space slice  vector register\n";
+  code << "// \n";
+  code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << "// -----------------------------------------------------------------------------\n";
+  code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
+  code << "__global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n";
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
     }
   }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << "  const CeedInt dim = " << dim << ";\n";
+  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+
+  // Shared data
+  code << "  extern __shared__ CeedScalar slice[];\n";
+  code << "  SharedData_Hip data;\n";
+  code << "  data.t_id_x = threadIdx.x;\n";
+  code << "  data.t_id_y = threadIdx.y;\n";
+  code << "  data.t_id_z = threadIdx.z;\n";
+  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << "  data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
+
+  // Initialize constants, and matrices B and G
+  code << "\n  // Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCall(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+  }
+  code << "\n  // Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCall(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+  }
+
+  // Loop over all elements
+  code << "\n  // Element loop\n";
+  code << "  __syncthreads();\n";
+  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+
+  // -- Input restriction and basis
+  code << "    // -- Input field restrictions and basis actions\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    code << "    // ---- Input field " << i << "\n";
+
+    // ---- Restriction
+    CeedCallBackend(
+        CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
+                                                           op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
+
+  // -- Output basis and restriction
+  code << "\n    // -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << "    // ---- Output field " << i << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+
+    // ---- Restriction
+    CeedCallBackend(
+        CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+  }
 
+  // Close loop and function
   code << "  }\n";
   code << "}\n";
   code << "// -----------------------------------------------------------------------------\n\n";

From f8a0df597ca176fee6b07766b6124704acaa0050 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 21 Aug 2024 20:14:14 +0100
Subject: [PATCH 140/571] Skip duplicate transpose restrictions (#1645)

* cpu - skip duplicate output rstr

* cuda - skip duplicate output rstr

* hip - skip duplicate output rstr
---
 backends/blocked/ceed-blocked-operator.c   |  70 +++++++++---
 backends/blocked/ceed-blocked.h            |   3 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c |  63 ++++++++---
 backends/cuda-ref/ceed-cuda-ref.h          |   2 +-
 backends/hip-ref/ceed-hip-ref-operator.c   |  63 ++++++++---
 backends/hip-ref/ceed-hip-ref.h            |   2 +-
 backends/opt/ceed-opt-operator.c           |  64 ++++++++---
 backends/opt/ceed-opt.h                    |   1 +
 backends/ref/ceed-ref-operator.c           | 123 ++++++++++++++++-----
 backends/ref/ceed-ref.h                    |   3 +-
 10 files changed, 306 insertions(+), 88 deletions(-)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 788533cbff..80b1f44865 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -16,9 +16,9 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, const CeedInt block_size,
-                                           CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs,
-                                           CeedInt start_e, CeedInt num_fields, CeedInt Q) {
+static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedInt *e_data_out_indices,
+                                           bool *apply_add_basis, const CeedInt block_size, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full,
+                                           CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             num_comp, size, P;
@@ -135,7 +135,7 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
         break;
     }
   }
-  // Drop duplicate input restrictions
+  // Drop duplicate restrictions
   if (is_input) {
     for (CeedInt i = 0; i < num_fields; i++) {
       CeedVector          vec_i;
@@ -151,11 +151,33 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i], &e_vecs_full[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
           skip_rstr[j] = true;
         }
       }
     }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]          = true;
+          apply_add_basis[i]    = true;
+          e_data_out_indices[j] = i;
+        }
+      }
+    }
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -189,6 +211,9 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_data_out_indices));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -200,11 +225,12 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, impl->skip_rstr_in, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in,
-                                                  impl->q_vecs_in, 0, num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, impl->skip_rstr_in, NULL, NULL, block_size, impl->block_rstr, impl->e_vecs_full,
+                                                  impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, NULL, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out,
-                                                  impl->q_vecs_out, num_input_fields, num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, impl->skip_rstr_out, impl->e_data_out_indices, impl->apply_add_basis_out, block_size,
+                                                  impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
+                                                  num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -310,8 +336,8 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc
 // Output Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
-                                                  CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op,
-                                                  CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
+                                                  CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis,
+                                                  CeedOperator op, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedInt             elem_size, num_comp;
     CeedEvalMode        eval_mode;
@@ -334,7 +360,11 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
                                            &e_data_full[i + num_input_fields][(CeedSize)e * elem_size * num_comp]));
-        CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -405,8 +435,12 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request));
 
   // Output Evecs
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+  for (CeedInt i = num_output_fields - 1; i >= 0; i--) {
+    if (impl->skip_rstr_out[i]) {
+      e_data_full[i + num_input_fields] = e_data_full[impl->e_data_out_indices[i] + num_input_fields];
+    } else {
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+    }
   }
 
   // Loop through elements
@@ -430,14 +464,15 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
     }
 
     // Output basis apply
-    CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op,
-                                                    e_data_full, impl));
+    CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields,
+                                                    impl->apply_add_basis_out, op, e_data_full, impl));
   }
 
   // Output restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedVector vec;
 
+    if (impl->skip_rstr_out[i]) continue;
     // Restore evec
     CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields]));
     // Get output vector
@@ -671,6 +706,9 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->e_data_out_indices));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedElemRestrictionDestroy(&impl->block_rstr[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h
index 5876e969b7..f04307abdc 100644
--- a/backends/blocked/ceed-blocked.h
+++ b/backends/blocked/ceed-blocked.h
@@ -17,7 +17,8 @@ typedef struct {
 
 typedef struct {
   bool                 is_identity_qf, is_identity_rstr_op;
-  bool                *skip_rstr_in;
+  bool                *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  CeedInt             *e_data_out_indices;
   uint64_t            *input_states; /* State counter of inputs */
   CeedVector          *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   CeedVector          *e_vecs_in;    /* Element block input E-vectors  */
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 9f6d3d14b0..748237dab8 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -28,6 +28,8 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
 
   // Apply data
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
   }
@@ -97,8 +99,8 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, CeedVector *e_vecs,
-                                        CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
+                                        CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -184,7 +186,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         break;
     }
   }
-  // Drop duplicate input restrictions
+  // Drop duplicate restrictions
   if (is_input) {
     for (CeedInt i = 0; i < num_fields; i++) {
       CeedVector          vec_i;
@@ -199,11 +201,31 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
           skip_rstr[j] = true;
         }
       }
     }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
+        }
+      }
+    }
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -234,6 +256,8 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -243,10 +267,10 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
   CeedCallBackend(
-      CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
+      CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
   // Outfields
-  CeedCallBackend(
-      CeedOperatorSetupFields_Cuda(qf, op, false, false, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
+                                               num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -431,7 +455,11 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        if (impl->apply_add_basis_out[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -452,6 +480,7 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
     if (eval_mode == CEED_EVAL_NONE) {
       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
     }
+    if (impl->skip_rstr_out[i]) continue;
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Restrict
@@ -499,6 +528,8 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -507,11 +538,11 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
                                                max_num_points, num_elem));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
-                                               max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
+                                               num_input_fields, num_output_fields, max_num_points, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -635,8 +666,13 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                               impl->e_vecs[i + impl->num_inputs]));
+        if (impl->apply_add_basis_out[i]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
+                                                    impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        } else {
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                                 impl->e_vecs[i + impl->num_inputs]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -657,6 +693,7 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     if (eval_mode == CEED_EVAL_NONE) {
       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
     }
+    if (impl->skip_rstr_out[i]) continue;
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Restrict
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index f8430a1b12..ff0bbaf349 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -128,7 +128,7 @@ typedef struct {
 } CeedOperatorAssemble_Cuda;
 
 typedef struct {
-  bool                      *skip_rstr_in;
+  bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
   uint64_t                  *input_states;  // State tracking for passive inputs
   CeedVector                *e_vecs;        // E-vectors, inputs followed by outputs
   CeedVector                *q_vecs_in;     // Input Q-vectors needed to apply operator
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index bb5d09816d..6045d0ad96 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -27,6 +27,8 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
 
   // Apply data
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
   }
@@ -96,8 +98,8 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, CeedVector *e_vecs,
-                                       CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
+                                       CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -183,7 +185,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         break;
     }
   }
-  // Drop duplicate input restrictions
+  // Drop duplicate restrictions
   if (is_input) {
     for (CeedInt i = 0; i < num_fields; i++) {
       CeedVector          vec_i;
@@ -198,11 +200,31 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
           skip_rstr[j] = true;
         }
       }
     }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
+        }
+      }
+    }
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -233,6 +255,8 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -242,10 +266,10 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
   CeedCallBackend(
-      CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
+      CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
   // Outfields
-  CeedCallBackend(
-      CeedOperatorSetupFields_Hip(qf, op, false, false, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
+                                              num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -430,7 +454,11 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        if (impl->apply_add_basis_out[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -451,6 +479,7 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
     if (eval_mode == CEED_EVAL_NONE) {
       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
     }
+    if (impl->skip_rstr_out[i]) continue;
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Restrict
@@ -498,6 +527,8 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
@@ -506,11 +537,11 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
                                               max_num_points, num_elem));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
-                                              max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
+                                              num_input_fields, num_output_fields, max_num_points, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
@@ -634,8 +665,13 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                               impl->e_vecs[i + impl->num_inputs]));
+        if (impl->apply_add_basis_out[i]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
+                                                    impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        } else {
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
+                                                 impl->e_vecs[i + impl->num_inputs]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -656,6 +692,7 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     if (eval_mode == CEED_EVAL_NONE) {
       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
     }
+    if (impl->skip_rstr_out[i]) continue;
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Restrict
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 5199ce8767..59f8d809c2 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -132,7 +132,7 @@ typedef struct {
 } CeedOperatorAssemble_Hip;
 
 typedef struct {
-  bool                     *skip_rstr_in;
+  bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
   uint64_t                 *input_states;  // State tracking for passive inputs
   CeedVector               *e_vecs;        // E-vectors, inputs followed by outputs
   CeedVector               *q_vecs_in;     // Input Q-vectors needed to apply operator
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index eaacaedc12..4de37eaf66 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -16,9 +16,9 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size, CeedElemRestriction *block_rstr,
-                                       CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields,
-                                       CeedInt Q) {
+static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, bool *apply_add_basis,
+                                       const CeedInt block_size, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs,
+                                       CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             num_comp, size, P;
@@ -139,7 +139,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
     // Initialize E-vec arrays
     if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0));
   }
-  // Drop duplicate input restrictions
+  // Drop duplicate restrictions
   if (is_input) {
     for (CeedInt i = 0; i < num_fields; i++) {
       CeedVector          vec_i;
@@ -155,8 +155,29 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i], &e_vecs_full[j]));
-          CeedCallBackend(CeedElemRestrictionDestroy(&block_rstr[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j] = true;
+        }
+      }
+    }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
         }
       }
     }
@@ -194,6 +215,9 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr));
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -205,11 +229,11 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
-                                              num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, impl->skip_rstr_in, NULL, block_size, impl->block_rstr, impl->e_vecs_full,
+                                              impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out,
-                                              num_input_fields, num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, impl->skip_rstr_out, impl->apply_add_basis_out, block_size, impl->block_rstr,
+                                              impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -251,7 +275,7 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun
       if (vec != CEED_VECTOR_ACTIVE) {
         // Restrict
         CeedCallBackend(CeedVectorGetState(vec, &state));
-        if (state != impl->input_states[i] && impl->block_rstr[i]) {
+        if (state != impl->input_states[i] && impl->block_rstr[i] && !impl->skip_rstr_in[i]) {
           CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
         }
         impl->input_states[i] = state;
@@ -327,8 +351,8 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction
 // Output Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
-                                              CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op,
-                                              CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) {
+                                              CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis,
+                                              bool *skip_rstr, CeedOperator op, CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode        eval_mode;
     CeedVector          vec;
@@ -347,7 +371,11 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -356,6 +384,7 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
       }
     }
     // Restrict output block
+    if (skip_rstr[i]) continue;
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
@@ -448,8 +477,8 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
     }
 
     // Output basis apply and restriction
-    CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op,
-                                                out_vec, impl, request));
+    CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields,
+                                                impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec, impl, request));
   }
 
   // Restore input arrays
@@ -694,6 +723,9 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->block_rstr));
   CeedCallBackend(CeedFree(&impl->e_vecs_full));
   CeedCallBackend(CeedFree(&impl->input_states));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i]));
diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h
index b40124fb99..d5f7399a89 100644
--- a/backends/opt/ceed-opt.h
+++ b/backends/opt/ceed-opt.h
@@ -21,6 +21,7 @@ typedef struct {
 
 typedef struct {
   bool                 is_identity_qf, is_identity_rstr_op;
+  bool                *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
   CeedElemRestriction *block_rstr;   /* Blocked versions of restrictions */
   CeedVector          *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   uint64_t            *input_states; /* State counter of inputs */
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index d605c943e7..de79e96d5b 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -16,8 +16,9 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs,
-                                       CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
+static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedInt *e_data_out_indices,
+                                       bool *apply_add_basis, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e,
+                                       CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             num_comp, size, P;
@@ -78,7 +79,7 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
         break;
     }
   }
-  // Drop duplicate input restrictions
+  // Drop duplicate restrictions
   if (is_input) {
     for (CeedInt i = 0; i < num_fields; i++) {
       CeedVector          vec_i;
@@ -94,11 +95,33 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i], &e_vecs_full[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
           skip_rstr[j] = true;
         }
       }
     }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]          = true;
+          apply_add_basis[i]    = true;
+          e_data_out_indices[j] = i;
+        }
+      }
+    }
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -128,6 +151,9 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_data_out_indices));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -139,11 +165,11 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(
-      CeedOperatorSetupFields_Ref(qf, op, true, impl->skip_rstr_in, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, true, impl->skip_rstr_in, NULL, NULL, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
+                                              num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, false, NULL, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
-                                              num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, false, impl->skip_rstr_out, impl->e_data_out_indices, impl->apply_add_basis_out,
+                                              impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -252,7 +278,7 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction
 // Output Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
-                                              CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op,
+                                              CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis, CeedOperator op,
                                               CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedInt             elem_size, num_comp;
@@ -276,7 +302,11 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
                                            &e_data_full[i + num_input_fields][(CeedSize)e * elem_size * num_comp]));
-        CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -350,8 +380,12 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request));
 
   // Output Evecs
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+  for (CeedInt i = num_output_fields - 1; i >= 0; i--) {
+    if (impl->skip_rstr_out[i]) {
+      e_data_full[i + num_input_fields] = e_data_full[impl->e_data_out_indices[i] + num_input_fields];
+    } else {
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+    }
   }
 
   // Loop through elements
@@ -375,8 +409,8 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
     }
 
     // Output basis apply
-    CeedCallBackend(
-        CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, num_input_fields, num_output_fields, op, e_data_full, impl));
+    CeedCallBackend(CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, num_input_fields, num_output_fields,
+                                                impl->apply_add_basis_out, op, e_data_full, impl));
   }
 
   // Output restriction
@@ -384,6 +418,7 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
     CeedVector          vec;
     CeedElemRestriction elem_rstr;
 
+    if (impl->skip_rstr_out[i]) continue;
     // Restore Evec
     CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields]));
     // Get output vector
@@ -590,8 +625,9 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Ref(CeedOperator op, CeedVe
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedVector *e_vecs_full,
-                                               CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
+static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, bool *apply_add_basis,
+                                               CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields,
+                                               CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             max_num_points, num_comp, size, P;
@@ -685,7 +721,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
     if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0));
     if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorSetValue(q_vecs[i], 0.0));
   }
-  // Drop duplicate input restrictions
+  // Drop duplicate restrictions
   if (is_input) {
     for (CeedInt i = 0; i < num_fields; i++) {
       CeedVector          vec_i;
@@ -701,10 +737,32 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
           skip_rstr[j] = true;
         }
       }
     }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
+        }
+      }
+    }
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -734,6 +792,8 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -745,11 +805,11 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->skip_rstr_in, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
+  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->skip_rstr_in, NULL, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
                                                       num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, NULL, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
-                                                      num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_full,
+                                                      impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -828,8 +888,8 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_points_offset, CeedInt num_points, CeedQFunctionField *qf_output_fields,
                                                       CeedOperatorField *op_output_fields, CeedInt num_input_fields, CeedInt num_output_fields,
-                                                      CeedOperator op, CeedVector out_vec, CeedVector point_coords_elem, CeedOperator_Ref *impl,
-                                                      CeedRequest *request) {
+                                                      bool *apply_add_basis, bool *skip_rstr, CeedOperator op, CeedVector out_vec,
+                                                      CeedVector point_coords_elem, CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedRestrictionType rstr_type;
     CeedEvalMode        eval_mode;
@@ -849,8 +909,13 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i],
+                                                    impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(
+              CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        }
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -859,6 +924,7 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
       }
     }
     // Restrict output block
+    if (skip_rstr[i]) continue;
     // Get output vector
     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
@@ -920,7 +986,8 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
 
     // Output basis apply and restriction
     CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
-                                                        num_output_fields, op, out_vec, impl->point_coords_elem, impl, request));
+                                                        num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
+                                                        impl->point_coords_elem, impl, request));
 
     num_points_offset += num_points;
   }
@@ -1292,7 +1359,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
         // -- Output basis apply and restriction
         CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
-                                                            num_output_fields, op, out_vec, impl->point_coords_elem, impl, request));
+                                                            num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
+                                                            impl->point_coords_elem, impl, request));
 
         // -- Grab diagonal value
         for (CeedInt j = 0; j < num_output_fields; j++) {
@@ -1389,6 +1457,9 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) {
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->e_data_out_indices));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
   }
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index ff8e9fa773..880b4f89af 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -49,7 +49,8 @@ typedef struct {
 
 typedef struct {
   bool        is_identity_qf, is_identity_rstr_op;
-  bool       *skip_rstr_in;
+  bool       *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  CeedInt    *e_data_out_indices;
   uint64_t   *input_states; /* State counter of inputs */
   CeedVector *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   CeedVector *e_vecs_in;    /* Single element input E-vectors  */

From 5a5594ffd3205667210320a8dc336d609d13e75f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 22 Aug 2024 11:49:50 -0600
Subject: [PATCH 141/571] minor - fix CeedCall() vs CeedCallBackend() in
 backend code

---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 4 ++--
 backends/cuda-gen/ceed-cuda-gen.c                  | 2 +-
 backends/cuda-ref/ceed-cuda-ref-basis.c            | 4 ++--
 backends/cuda-ref/ceed-cuda-ref-restriction.c      | 8 ++++----
 backends/cuda-ref/ceed-cuda-ref-vector.c           | 4 ++--
 backends/cuda-shared/ceed-cuda-shared-basis.c      | 4 ++--
 backends/hip-gen/ceed-hip-gen-operator-build.cpp   | 4 ++--
 backends/hip-ref/ceed-hip-ref-basis.c              | 4 ++--
 backends/hip-ref/ceed-hip-ref-restriction.c        | 8 ++++----
 backends/hip-ref/ceed-hip-ref-vector.c             | 4 ++--
 backends/hip-shared/ceed-hip-shared-basis.c        | 4 ++--
 backends/magma/ceed-magma-basis.c                  | 8 ++++----
 backends/sycl/ceed-sycl-compile.sycl.cpp           | 2 +-
 13 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 28583b103b..eb8d5ad848 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -777,11 +777,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCall(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCall(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
   }
 
   // Loop over all elements
diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index e1833be2a2..fd4fcef722 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -31,7 +31,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetData(ceed, data));
   CeedCallBackend(CeedInit_Cuda(ceed, resource));
 
-  CeedCall(CeedInit("/gpu/cuda/shared", &ceed_shared));
+  CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
 
   CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 8cf285cbc8..5efaeee456 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -123,7 +123,7 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
 
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
-    CeedCall(CeedVectorSetValue(v, 1.0));
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -141,7 +141,7 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
 
       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
-      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
       CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
       CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 7e381e77e0..0f20ceca7d 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -112,8 +112,8 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose));
       // Cleanup
       CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_CURL_ORIENTED: {
       const char *offset_kernel_path;
@@ -137,8 +137,8 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
       // Cleanup
       CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
     } break;
   }
   CeedCallBackend(CeedFree(&restriction_kernel_path));
diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index e15d44789a..d6622e0e99 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -247,8 +247,8 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
   {
     CeedSize length_vec, length_copy;
 
-    CeedCall(CeedVectorGetLength(vec, &length_vec));
-    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    CeedCallBackend(CeedVectorGetLength(vec, &length_vec));
+    CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy));
     length = length_vec < length_copy ? length_vec : length_copy;
   }
   // Set value for synced device/host array
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 4f0901484d..8245149f6d 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -229,7 +229,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
 
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
-    CeedCall(CeedVectorSetValue(v, 1.0));
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -247,7 +247,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
 
       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
-      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
       CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
       CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 623c3deb9a..6926e6fb4e 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -785,11 +785,11 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCall(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCall(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
   }
 
   // Loop over all elements
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 78e27a1a5f..ea3f4e6e3a 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -121,7 +121,7 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
 
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
-    CeedCall(CeedVectorSetValue(v, 1.0));
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -139,7 +139,7 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
 
       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
-      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
       CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
       CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 0cbdc64c3b..41bba37520 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -111,8 +111,8 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose));
       // Cleanup
       CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_CURL_ORIENTED: {
       const char *offset_kernel_path;
@@ -136,8 +136,8 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
       // Cleanup
       CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
     } break;
   }
   CeedCallBackend(CeedFree(&restriction_kernel_path));
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 2883de9e25..f57d8bcf69 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -247,8 +247,8 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
   {
     CeedSize length_vec, length_copy;
 
-    CeedCall(CeedVectorGetLength(vec, &length_vec));
-    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    CeedCallBackend(CeedVectorGetLength(vec, &length_vec));
+    CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy));
     length = length_vec < length_copy ? length_vec : length_copy;
   }
   // Set value for synced device/host array
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index bda080ed2d..959078e5b1 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -288,7 +288,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
 
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
-    CeedCall(CeedVectorSetValue(v, 1.0));
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -306,7 +306,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
 
       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
-      CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
       CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
       CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 71a86d5b8d..6f5fa0a2ca 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -369,8 +369,8 @@ static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, Ce
       }
       CeedCallBackend(CeedFree(&basis_kernel_path));
       CeedCallBackend(CeedFree(&basis_kernel_source));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
     }
   }
 
@@ -617,8 +617,8 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedFree(&grad_kernel_path));
   CeedCallBackend(CeedFree(&weight_kernel_path));
   CeedCallBackend(CeedFree(&basis_kernel_source));
-  for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-  CeedCall(CeedFree(&file_paths));
+  for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+  CeedCallBackend(CeedFree(&file_paths));
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp
index 9615114158..9dc0177401 100644
--- a/backends/sycl/ceed-sycl-compile.sycl.cpp
+++ b/backends/sycl/ceed-sycl-compile.sycl.cpp
@@ -106,7 +106,7 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con
 
     zeModuleBuildLogGetString(lz_log, &log_size, nullptr);
 
-    CeedCall(CeedCalloc(log_size, &log_message));
+    CeedCallBackend(CeedCalloc(log_size, &log_message));
     zeModuleBuildLogGetString(lz_log, &log_size, log_message);
 
     return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to compile Level Zero module:\n%s", log_message);

From bf84744c45d908693f8d34d79c45258a8cb93a26 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 22 Aug 2024 11:50:44 -0600
Subject: [PATCH 142/571] leak - add missing CeedFree for string

---
 backends/cuda-shared/ceed-cuda-shared.c | 1 +
 interface/ceed.c                        | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c
index ef704f7193..5ab65815cc 100644
--- a/backends/cuda-shared/ceed-cuda-shared.c
+++ b/backends/cuda-shared/ceed-cuda-shared.c
@@ -24,6 +24,7 @@ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
   CeedCheck(!strcmp(resource_root, "/gpu/cuda/shared"), ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedFree(&resource_root));
   CeedCallBackend(CeedSetDeterministic(ceed, true));
 
   CeedCallBackend(CeedCalloc(1, &data));
diff --git a/interface/ceed.c b/interface/ceed.c
index 3bdd471454..b15cba3a51 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -443,7 +443,9 @@ int CeedIsDebug(Ceed ceed, bool *is_debug) {
 }
 
 /**
-  @brief Get the root of the requested resource
+  @brief Get the root of the requested resource.
+
+  Note: Caller is responsible for calling @ref CeedFree() on the `resource_root`.
 
   @param[in]  ceed          `Ceed` context to get resource name of
   @param[in]  resource      Full user specified resource

From df8a6b43ba5a7cbac05ea11e2e90d2aae1c4380f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 22 Aug 2024 12:02:24 -0600
Subject: [PATCH 143/571] test - fix input misordering

---
 tests/t593-operator.c | 4 ++--
 tests/t594-operator.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/t593-operator.c b/tests/t593-operator.c
index 2a2daceb88..2e0710c7fc 100644
--- a/tests/t593-operator.c
+++ b/tests/t593-operator.c
@@ -85,13 +85,13 @@ int main(int argc, char **argv) {
 
   // Setup geometric scaling
   CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
-  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
   CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
   CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
 
   CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
-  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
   CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
 
diff --git a/tests/t594-operator.c b/tests/t594-operator.c
index 2d4e6d876c..49405e37a4 100644
--- a/tests/t594-operator.c
+++ b/tests/t594-operator.c
@@ -85,13 +85,13 @@ int main(int argc, char **argv) {
 
   // Setup geometric scaling
   CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
-  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
   CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
   CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
 
   CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
-  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
   CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
 

From d1931fc83dfaa61549375a0461a8efe2c16b442e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 23 Aug 2024 08:46:06 -0600
Subject: [PATCH 144/571] memcheck - invalidate arrays before  freeing

---
 backends/memcheck/ceed-memcheck-vector.c | 33 +++++++++++++++++-------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index ae29245120..0435fe7e5a 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -49,7 +49,13 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
 
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
 
+  if (impl->array_allocated) {
+    for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+  }
   CeedCallBackend(CeedFree(&impl->array_allocated));
+  if (impl->array_owned) {
+    for (CeedSize i = 0; i < length; i++) impl->array_owned[i] = NAN;
+  }
   CeedCallBackend(CeedFree(&impl->array_owned));
   switch (copy_mode) {
     case CEED_COPY_VALUES:
@@ -57,7 +63,7 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
       impl->array_borrowed = NULL;
       impl->array          = impl->array_owned;
       if (array) {
-        memcpy(impl->array, array, length * sizeof(array[0]));
+        memcpy(impl->array, array, length * sizeof(CeedScalar));
       } else {
         for (CeedInt i = 0; i < length; i++) impl->array[i] = NAN;
       }
@@ -73,10 +79,10 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   }
   // Copy data to check access
   CeedCallBackend(CeedCalloc(length, &impl->array_allocated));
-  memcpy(impl->array_allocated, impl->array, length * sizeof(array[0]));
+  memcpy(impl->array_allocated, impl->array, length * sizeof(CeedScalar));
   impl->array = impl->array_allocated;
   VALGRIND_DISCARD(impl->mem_block_id);
-  impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->array, length * sizeof(array[0]), "'Vector backend array data copy'");
+  impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->array, length * sizeof(CeedScalar), "'Vector backend array data copy'");
   return CEED_ERROR_SUCCESS;
 }
 
@@ -84,9 +90,11 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
 // Vector Take Array
 //------------------------------------------------------------------------------
 static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
+  CeedSize             length;
   CeedVector_Memcheck *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
 
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
 
@@ -94,6 +102,9 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce
   impl->array_borrowed = NULL;
   impl->array          = NULL;
   VALGRIND_DISCARD(impl->mem_block_id);
+  if (impl->array_allocated) {
+    for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+  }
   CeedCallBackend(CeedFree(&impl->array_allocated));
   return CEED_ERROR_SUCCESS;
 }
@@ -111,7 +122,7 @@ static int CeedVectorGetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
 
   CeedCallBackend(CeedCalloc(length, &impl->array_writable_copy));
-  memcpy(impl->array_writable_copy, impl->array, length * sizeof((impl->array)[0]));
+  memcpy(impl->array_writable_copy, impl->array, length * sizeof(CeedScalar));
   *array = impl->array_writable_copy;
   return CEED_ERROR_SUCCESS;
 }
@@ -129,11 +140,11 @@ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type,
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
 
   // Make copy to verify no write occurred
-  *array = impl->array;
   if (!impl->array_read_only_copy) {
     CeedCallBackend(CeedCalloc(length, &impl->array_read_only_copy));
-    memcpy(impl->array_read_only_copy, *array, length * sizeof((*array)[0]));
+    memcpy(impl->array_read_only_copy, impl->array, length * sizeof(CeedScalar));
   }
+  *array = impl->array_read_only_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -167,7 +178,8 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
 
-  memcpy(impl->array, impl->array_writable_copy, length * sizeof((impl->array)[0]));
+  memcpy(impl->array, impl->array_writable_copy, length * sizeof(CeedScalar));
+  for (CeedSize i = 0; i < length; i++) impl->array_writable_copy[i] = NAN;
   CeedCallBackend(CeedFree(&impl->array_writable_copy));
   if (impl->is_write_only_access) {
     for (CeedSize i = 0; i < length; i++) {
@@ -177,10 +189,10 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
     impl->is_write_only_access = false;
   }
   if (impl->array_borrowed) {
-    memcpy(impl->array_borrowed, impl->array, length * sizeof(impl->array[0]));
+    memcpy(impl->array_borrowed, impl->array, length * sizeof(CeedScalar));
   }
   if (impl->array_owned) {
-    memcpy(impl->array_owned, impl->array, length * sizeof(impl->array[0]));
+    memcpy(impl->array_owned, impl->array, length * sizeof(CeedScalar));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -195,9 +207,10 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) {
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(!memcmp(impl->array, impl->array_read_only_copy, length * sizeof(impl->array[0])), CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
+  CeedCheck(!memcmp(impl->array, impl->array_read_only_copy, length * sizeof(CeedScalar)), CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
             "Array data changed while accessed in read-only mode");
 
+  for (CeedSize i = 0; i < length; i++) impl->array_read_only_copy[i] = NAN;
   CeedCallBackend(CeedFree(&impl->array_read_only_copy));
   return CEED_ERROR_SUCCESS;
 }

From 42a37a0bedc8a227472fb83b1871607d9c2bb231 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 27 Aug 2024 13:43:08 -0600
Subject: [PATCH 145/571] leak - fix leak in bpsraw

---
 examples/petsc/bpsraw.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index 5e2cdac76f..e43e6567c3 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -438,9 +438,6 @@ int main(int argc, char **argv) {
   PetscCall(VecSetFromOptions(X));
   PetscCall(VecSetUp(X));
 
-  // Set up libCEED
-  CeedInit(ceed_resource, &ceed);
-
   // Print summary
   PetscInt gsize;
 
@@ -798,21 +795,22 @@ int main(int argc, char **argv) {
   CeedVectorDestroy(&op_apply_ctx->y_ceed);
   CeedVectorDestroy(&op_apply_ctx->q_data);
   CeedVectorDestroy(&target);
-  CeedOperatorDestroy(&op_setup_geo);
-  CeedOperatorDestroy(&op_setup_rhs);
-  CeedOperatorDestroy(&op_apply);
-  CeedOperatorDestroy(&op_error);
   CeedElemRestrictionDestroy(&elem_restr_u);
   CeedElemRestrictionDestroy(&elem_restr_x);
   CeedElemRestrictionDestroy(&elem_restr_u_i);
   CeedElemRestrictionDestroy(&elem_restr_qd_i);
+  CeedBasisDestroy(&basis_u);
+  CeedBasisDestroy(&basis_x);
   CeedQFunctionDestroy(&qf_setup_geo);
   CeedQFunctionDestroy(&qf_setup_rhs);
   CeedQFunctionDestroy(&qf_apply);
   CeedQFunctionDestroy(&qf_error);
-  CeedBasisDestroy(&basis_u);
-  CeedBasisDestroy(&basis_x);
+  CeedOperatorDestroy(&op_setup_geo);
+  CeedOperatorDestroy(&op_setup_rhs);
+  CeedOperatorDestroy(&op_apply);
+  CeedOperatorDestroy(&op_error);
   CeedDestroy(&ceed);
+
   PetscCall(PetscFree(op_apply_ctx));
   return PetscFinalize();
 }

From 4dbe2ad5260ad67d4b6134091b73ad6d31c08219 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 27 Aug 2024 15:33:39 -0600
Subject: [PATCH 146/571] leak - fix leak in multigrid ex

---
 examples/petsc/area.c                 |  2 +-
 examples/petsc/bps.c                  |  2 +-
 examples/petsc/bpssphere.c            |  2 +-
 examples/petsc/include/libceedsetup.h |  4 +-
 examples/petsc/multigrid.c            | 20 ++++----
 examples/petsc/src/libceedsetup.c     | 73 ++++++++++++++-------------
 6 files changed, 53 insertions(+), 50 deletions(-)

diff --git a/examples/petsc/area.c b/examples/petsc/area.c
index c72de7d6fa..6b782e45fa 100644
--- a/examples/petsc/area.c
+++ b/examples/petsc/area.c
@@ -168,7 +168,7 @@ int main(int argc, char **argv) {
   // Setup libCEED's objects and apply setup operator
   PetscCall(PetscMalloc1(1, &ceed_data));
   PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, problem_options[problem_choice],
-                                 ceed_data, false, (CeedVector)NULL, (CeedVector *)NULL));
+                                 ceed_data, false, true, (CeedVector)NULL, (CeedVector *)NULL));
 
   // Setup output vector
   PetscCall(VecZeroEntries(V_loc));
diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index 29101e1379..42e05a1416 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -155,7 +155,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
 
   PetscCall(PetscMalloc1(1, &ceed_data));
   PetscCall(SetupLibceedByDegree(dm, ceed, rp->degree, rp->dim, rp->q_extra, rp->dim, rp->num_comp_u, g_size, xl_size, bp_options[rp->bp_choice],
-                                 ceed_data, true, rhs_ceed, &target));
+                                 ceed_data, true, true, rhs_ceed, &target));
 
   // Gather RHS
   PetscCall(VecC2P(rhs_ceed, mem_type, rhs_loc));
diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c
index d928a815c1..9fc63472fa 100644
--- a/examples/petsc/bpssphere.c
+++ b/examples/petsc/bpssphere.c
@@ -175,7 +175,7 @@ int main(int argc, char **argv) {
   // Setup libCEED's objects
   PetscCall(PetscMalloc1(1, &ceed_data));
   PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, bp_options[bp_choice], ceed_data, true,
-                                 rhs_ceed, &target));
+                                 true, rhs_ceed, &target));
 
   // Gather RHS
   PetscCall(VecC2P(rhs_ceed, mem_type, rhs_loc));
diff --git a/examples/petsc/include/libceedsetup.h b/examples/petsc/include/libceedsetup.h
index 611c30eb9a..19f5338784 100644
--- a/examples/petsc/include/libceedsetup.h
+++ b/examples/petsc/include/libceedsetup.h
@@ -16,8 +16,8 @@
 
 PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data);
 PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u,
-                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed,
-                                    CeedVector *target);
+                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, PetscBool is_fine_level,
+                                    CeedVector rhs_ceed, CeedVector *target);
 PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, CeedInt num_comp_u, CeedData *data, BPData bp_data, Vec fine_mult);
 PetscErrorCode SetupErrorOperator(DM dm, Ceed ceed, BPData bp_data, CeedInt topo_dim, PetscInt num_comp_x, PetscInt num_comp_u,
                                   CeedOperator *op_error);
diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index e9f78197a3..1eb51d6dd0 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -155,10 +155,10 @@ int main(int argc, char **argv) {
 
   switch (coarsen) {
     case COARSEN_UNIFORM:
-      for (int i = 0; i < num_levels; i++) level_degrees[i] = i + 1;
+      for (PetscInt i = 0; i < num_levels; i++) level_degrees[i] = i + 1;
       break;
     case COARSEN_LOGARITHMIC:
-      for (int i = 0; i < num_levels - 1; i++) level_degrees[i] = pow(2, i);
+      for (PetscInt i = 0; i < num_levels - 1; i++) level_degrees[i] = pow(2, i);
       level_degrees[fine_level] = degree;
       break;
   }
@@ -199,7 +199,6 @@ int main(int argc, char **argv) {
 
     // Operator
     PetscCall(PetscMalloc1(1, &op_apply_ctx[i]));
-    PetscCall(PetscMalloc1(1, &op_error_ctx));
     PetscCall(MatCreateShell(comm, l_size[i], l_size[i], g_size[i], g_size[i], op_apply_ctx[i], &mat_O[i]));
     PetscCall(MatShellSetOperation(mat_O[i], MATOP_MULT, (void (*)(void))MatMult_Ceed));
     PetscCall(MatShellSetOperation(mat_O[i], MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag));
@@ -267,7 +266,7 @@ int main(int argc, char **argv) {
     }
     PetscCall(PetscMalloc1(1, &ceed_data[i]));
     PetscCall(SetupLibceedByDegree(dm[i], ceed, level_degrees[i], dim, q_extra, dim, num_comp_u, g_size[i], xl_size[i], bp_options[bp_choice],
-                                   ceed_data[i], i == (fine_level), rhs_ceed, &target));
+                                   ceed_data[i], i == fine_level, i == fine_level, rhs_ceed, &target));
   }
 
   // Gather RHS
@@ -291,7 +290,7 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_error, "error", ceed_data[fine_level]->elem_restr_u, ceed_data[fine_level]->basis_u, CEED_VECTOR_ACTIVE);
 
   // Calculate multiplicity
-  for (int i = 0; i < num_levels; i++) {
+  for (PetscInt i = 0; i < num_levels; i++) {
     PetscMemType mem_type;
 
     // CEED vector
@@ -322,7 +321,7 @@ int main(int argc, char **argv) {
   }
 
   // Set up Mat
-  for (int i = 0; i < num_levels; i++) {
+  for (PetscInt i = fine_level; i >= 0; i--) {
     // Set up apply operator context
     PetscCall(SetupApplyOperatorCtx(comm, dm[i], ceed, ceed_data[i], X_loc[i], op_apply_ctx[i]));
 
@@ -335,8 +334,8 @@ int main(int argc, char **argv) {
       pr_restr_ctx[i]->loc_vec_c   = X_loc[i - 1];
       pr_restr_ctx[i]->loc_vec_f   = op_apply_ctx[i]->Y_loc;
       pr_restr_ctx[i]->mult_vec    = mult[i];
-      pr_restr_ctx[i]->ceed_vec_c  = op_apply_ctx[i - 1]->x_ceed;
-      pr_restr_ctx[i]->ceed_vec_f  = op_apply_ctx[i]->y_ceed;
+      pr_restr_ctx[i]->ceed_vec_c  = ceed_data[i - 1]->x_ceed;
+      pr_restr_ctx[i]->ceed_vec_f  = ceed_data[i]->y_ceed;
       pr_restr_ctx[i]->op_prolong  = ceed_data[i]->op_prolong;
       pr_restr_ctx[i]->op_restrict = ceed_data[i]->op_restrict;
       pr_restr_ctx[i]->ceed        = ceed;
@@ -393,7 +392,7 @@ int main(int argc, char **argv) {
 
     // PCMG levels
     PetscCall(PCMGSetLevels(pc, num_levels, NULL));
-    for (int i = 0; i < num_levels; i++) {
+    for (PetscInt i = 0; i < num_levels; i++) {
       // Smoother
       KSP smoother;
       PC  smoother_pc;
@@ -502,6 +501,7 @@ int main(int argc, char **argv) {
     }
     {
       // Set up error operator context
+      PetscCall(PetscMalloc1(1, &op_error_ctx));
       PetscCall(SetupErrorOperatorCtx(comm, dm[fine_level], ceed, ceed_data[fine_level], X_loc[fine_level], op_error, op_error_ctx));
       PetscScalar l2_error;
       PetscCall(ComputeL2Error(X[fine_level], &l2_error, op_error_ctx));
@@ -532,7 +532,7 @@ int main(int argc, char **argv) {
   }
 
   // Cleanup
-  for (int i = 0; i < num_levels; i++) {
+  for (PetscInt i = 0; i < num_levels; i++) {
     PetscCall(VecDestroy(&X[i]));
     PetscCall(VecDestroy(&X_loc[i]));
     PetscCall(VecDestroy(&mult[i]));
diff --git a/examples/petsc/src/libceedsetup.c b/examples/petsc/src/libceedsetup.c
index 086fb669c7..19ecd1880a 100644
--- a/examples/petsc/src/libceedsetup.c
+++ b/examples/petsc/src/libceedsetup.c
@@ -40,14 +40,14 @@ PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data) {
 // Set up libCEED for a given degree
 // -----------------------------------------------------------------------------
 PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u,
-                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed,
-                                    CeedVector *target) {
+                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, PetscBool is_fine_level,
+                                    CeedVector rhs_ceed, CeedVector *target) {
   DM                  dm_coord;
   Vec                 coords;
   const PetscScalar  *coord_array;
   CeedBasis           basis_x, basis_u;
   CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_u_i, elem_restr_qd_i;
-  CeedQFunction       qf_setup_geo, qf_apply;
+  CeedQFunction       qf_setup_geo = NULL, qf_apply = NULL;
   CeedOperator        op_setup_geo, op_apply;
   CeedVector          x_coord, q_data, x_ceed, y_ceed;
   PetscInt            c_start, c_end, num_elem;
@@ -86,36 +86,42 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt to
   CeedVectorCreate(ceed, xl_size, &x_ceed);
   CeedVectorCreate(ceed, xl_size, &y_ceed);
 
-  // Create the QFunction that builds the context data
-  CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, &qf_setup_geo);
-  CeedQFunctionAddInput(qf_setup_geo, "x", num_comp_x, CEED_EVAL_INTERP);
-  CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * topo_dim, CEED_EVAL_GRAD);
-  CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT);
-  CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE);
-
-  // Create the operator that builds the quadrature data
-  CeedOperatorCreate(ceed, qf_setup_geo, NULL, NULL, &op_setup_geo);
-  CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
-  CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
-  CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
-  CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
-
-  // Setup q_data
-  CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE);
-
-  // Set up PDE operator
-  CeedInt in_scale  = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1;
-  CeedInt out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1;
-  CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply);
-  CeedQFunctionAddInput(qf_apply, "u", num_comp_u * in_scale, bp_data.in_mode);
-  CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE);
-  CeedQFunctionAddOutput(qf_apply, "v", num_comp_u * out_scale, bp_data.out_mode);
+  if (is_fine_level) {
+    // Create the QFunction that builds the context data
+    CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, &qf_setup_geo);
+    CeedQFunctionAddInput(qf_setup_geo, "x", num_comp_x, CEED_EVAL_INTERP);
+    CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * topo_dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE);
+
+    // Create the operator that builds the quadrature data
+    CeedOperatorCreate(ceed, qf_setup_geo, NULL, NULL, &op_setup_geo);
+    CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+    CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+    // Setup q_data
+    CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE);
+
+    // Set up PDE operator
+    CeedInt in_scale  = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1;
+    CeedInt out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1;
+    CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply);
+    CeedQFunctionAddInput(qf_apply, "u", num_comp_u * in_scale, bp_data.in_mode);
+    CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE);
+    CeedQFunctionAddOutput(qf_apply, "v", num_comp_u * out_scale, bp_data.out_mode);
+
+    // Create the mass or diff operator
+    CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply);
+    CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data);
+    CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
 
-  // Create the mass or diff operator
-  CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply);
-  CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
-  CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data);
-  CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    // Cleanup
+    CeedQFunctionDestroy(&qf_setup_geo);
+    CeedOperatorDestroy(&op_setup_geo);
+  }
 
   // Set up RHS if needed
   if (setup_rhs) {
@@ -151,10 +157,7 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt to
     CeedQFunctionDestroy(&qf_setup_rhs);
     CeedOperatorDestroy(&op_setup_rhs);
   }
-
   // Cleanup
-  CeedQFunctionDestroy(&qf_setup_geo);
-  CeedOperatorDestroy(&op_setup_geo);
   CeedVectorDestroy(&x_coord);
 
   // Save libCEED data required for level

From 637f263ad70b7dae14e8bceec9c44b48048f6410 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 Aug 2024 10:10:27 -0600
Subject: [PATCH 147/571] leak - fix leak in mfem ex

---
 examples/mfem/bp3.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/mfem/bp3.cpp b/examples/mfem/bp3.cpp
index d4b8eb24e9..b4e341db48 100644
--- a/examples/mfem/bp3.cpp
+++ b/examples/mfem/bp3.cpp
@@ -188,6 +188,7 @@ int main(int argc, char *argv[]) {
   delete fespace;
   delete fec;
   delete mesh;
+  delete D;
   CeedDestroy(&ceed);
   return 0;
 }

From d06a2c125c427f0694fb06710c5ee86c8219d1b1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 Aug 2024 10:14:10 -0600
Subject: [PATCH 148/571] minor - fix warning in ceed/ex2

---
 examples/ceed/ex2-surface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c
index d536068800..2264f26c25 100644
--- a/examples/ceed/ex2-surface.c
+++ b/examples/ceed/ex2-surface.c
@@ -202,7 +202,7 @@ int main(int argc, const char *argv[]) {
   CeedQFunction qf_apply;
   if (gallery) {
     // This creates the QFunction via the gallery.
-    char name[16] = "";
+    char name[25] = "";
     snprintf(name, sizeof name, "Poisson%" CeedInt_FMT "DApply", dim);
     CeedQFunctionCreateInteriorByName(ceed, name, &qf_apply);
   } else {

From 7de238d35179a0144950ac1351aac17ef48eee2b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 Aug 2024 11:03:21 -0600
Subject: [PATCH 149/571] leak - fix leaks in fluids ex

---
 examples/fluids/navierstokes.c       | 18 ++----------------
 examples/fluids/problems/advection.c |  1 +
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 4f2c5ae93f..7468ef5a05 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -328,22 +328,8 @@ int main(int argc, char **argv) {
 
   PetscCall(PetscFree(app_ctx->amat_type));
   PetscCall(PetscFree(app_ctx->wall_forces.walls));
-  {
-    const char *filename  = NULL;
-    PetscBool   is_stdout = PETSC_FALSE;
-
-    if (app_ctx->wall_forces.viewer) {
-      PetscCall(PetscViewerFileGetName(app_ctx->wall_forces.viewer, &filename));
-      if (filename) PetscCall(PetscStrncmp(filename, "stdout", 7, &is_stdout));
-      if (!is_stdout) PetscCall(PetscViewerDestroy(&app_ctx->wall_forces.viewer));
-    }
-
-    if (app_ctx->turb_spanstats_viewer) {
-      PetscCall(PetscViewerFileGetName(app_ctx->turb_spanstats_viewer, &filename));
-      if (filename) PetscCall(PetscStrncmp(filename, "stdout", 7, &is_stdout));
-      if (!is_stdout) PetscCall(PetscViewerDestroy(&app_ctx->turb_spanstats_viewer));
-    }
-  }
+  PetscCall(PetscViewerDestroy(&app_ctx->wall_forces.viewer));
+  PetscCall(PetscViewerDestroy(&app_ctx->turb_spanstats_viewer));
 
   // -- Structs
   for (PetscInt i = 0; i < problem->num_bc_defs; i++) {
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 3c8423a9d7..1795d6db0b 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -74,6 +74,7 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
 }

From 70dc8078dbe287928145230e5a499c81da54eabb Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 Aug 2024 13:24:53 -0600
Subject: [PATCH 150/571] dealii - update to newer syntax

---
 examples/deal.II/bps.cc |  2 +-
 examples/deal.II/bps.h  | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/deal.II/bps.cc b/examples/deal.II/bps.cc
index 9d72710d65..7205d0987d 100644
--- a/examples/deal.II/bps.cc
+++ b/examples/deal.II/bps.cc
@@ -167,7 +167,7 @@ main(int argc, char *argv[])
 #ifdef DEAL_II_WITH_P4EST
   parallel::distributed::Triangulation<dim> tria(MPI_COMM_WORLD);
 #else
-  parallel::shared::Triangulation<dim> tria(MPI_COMM_WORLD, ::Triangulation<dim>::none, true);
+  Triangulation<dim> tria;
 #endif
 
   GridGenerator::hyper_cube(tria);
diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 677ed1a81f..bca093663a 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -160,13 +160,13 @@ class OperatorCeed : public OperatorBase<Number>
    */
   ~OperatorCeed()
   {
-    CeedOperatorDestroy(&op_apply);
-    CeedQFunctionDestroy(&qf_apply);
-    CeedQFunctionContextDestroy(&build_ctx);
     CeedVectorDestroy(&q_data);
     CeedElemRestrictionDestroy(&q_data_restriction);
     CeedElemRestrictionDestroy(&sol_restriction);
     CeedBasisDestroy(&sol_basis);
+    CeedQFunctionContextDestroy(&build_ctx);
+    CeedQFunctionDestroy(&qf_apply);
+    CeedOperatorDestroy(&op_apply);
     CeedDestroy(&ceed);
   }
 
@@ -641,15 +641,15 @@ class OperatorCeed : public OperatorBase<Number>
 
     CeedOperatorApply(op_build, node_coords, q_data, CEED_REQUEST_IMMEDIATE);
 
-    CeedOperatorDestroy(&op_build);
-    CeedQFunctionDestroy(&qf_build);
-    CeedQFunctionContextDestroy(&build_ctx);
-    CeedElemRestrictionDestroy(&geo_restriction);
     CeedVectorDestroy(&node_coords);
-    CeedElemRestrictionDestroy(&q_data_restriction);
     CeedVectorSyncArray(q_data, CEED_MEM_HOST);
     CeedVectorDestroy(&q_data);
+    CeedElemRestrictionDestroy(&geo_restriction);
+    CeedElemRestrictionDestroy(&q_data_restriction);
     CeedBasisDestroy(&geo_basis);
+    CeedQFunctionContextDestroy(&build_ctx);
+    CeedQFunctionDestroy(&qf_build);
+    CeedOperatorDestroy(&op_build);
 
     return weights;
   }

From a64df932185b6dfd3e11d1422c3282714ed7a05e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 Aug 2024 13:31:11 -0600
Subject: [PATCH 151/571] ci - use newer MFEM

---
 .gitlab-ci.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index be6b4db93d..518ac31129 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -100,8 +100,8 @@ noether-cpu:
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids"
-# -- MFEM v4.6
-    - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
+# -- MFEM v4.7
+    - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
     - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=mfem
 # -- Nek5000 v19.0
@@ -265,8 +265,8 @@ noether-rocm:
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids"
-# -- MFEM v4.6
-    - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
+# -- MFEM v4.7
+    - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
     - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=mfem
 # -- Nek5000 v19.0

From 8efac6967d75d419bd042300912261bc66e56322 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 29 Aug 2024 09:47:47 -0600
Subject: [PATCH 152/571] dealii - eager destruction of refs

---
 examples/deal.II/bps.h | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index bca093663a..d4f5465bf6 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -160,12 +160,6 @@ class OperatorCeed : public OperatorBase<Number>
    */
   ~OperatorCeed()
   {
-    CeedVectorDestroy(&q_data);
-    CeedElemRestrictionDestroy(&q_data_restriction);
-    CeedElemRestrictionDestroy(&sol_restriction);
-    CeedBasisDestroy(&sol_basis);
-    CeedQFunctionContextDestroy(&build_ctx);
-    CeedQFunctionDestroy(&qf_apply);
     CeedOperatorDestroy(&op_apply);
     CeedDestroy(&ceed);
   }
@@ -176,6 +170,14 @@ class OperatorCeed : public OperatorBase<Number>
   void
   reinit() override
   {
+    CeedVector           q_data;
+    CeedBasis            sol_basis;
+    CeedElemRestriction  sol_restriction;
+    CeedElemRestriction  q_data_restriction;
+    BuildContext         build_ctx_data;
+    CeedQFunctionContext build_ctx;
+    CeedQFunction        qf_apply;
+
     const auto &tria = dof_handler.get_triangulation();
     const auto &fe   = dof_handler.get_fe();
 
@@ -265,7 +267,7 @@ class OperatorCeed : public OperatorBase<Number>
 
     CeedQFunctionContextCreate(ceed, &build_ctx);
     CeedQFunctionContextSetData(
-      build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
+      build_ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(build_ctx_data), &build_ctx_data);
 
     // 5) create q operation
     if (bp == BPType::BP1)
@@ -299,6 +301,14 @@ class OperatorCeed : public OperatorBase<Number>
     CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
     CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
     CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+
+    // 7) cleanup
+    CeedVectorDestroy(&q_data);
+    CeedElemRestrictionDestroy(&q_data_restriction);
+    CeedElemRestrictionDestroy(&sol_restriction);
+    CeedBasisDestroy(&sol_basis);
+    CeedQFunctionContextDestroy(&build_ctx);
+    CeedQFunctionDestroy(&qf_apply);
   }
 
   /**
@@ -693,15 +703,8 @@ class OperatorCeed : public OperatorBase<Number>
    * libCEED data structures.
    */
   Ceed                   ceed;
-  CeedBasis              sol_basis;
-  CeedElemRestriction    sol_restriction;
-  CeedElemRestriction    q_data_restriction;
   std::vector<double>    weights;
-  CeedVector             q_data;
   std::array<CeedInt, 3> strides;
-  BuildContext           build_ctx_data;
-  CeedQFunctionContext   build_ctx;
-  CeedQFunction          qf_apply;
   CeedOperator           op_apply;
 
   /**

From d2fd4e272d9fe7364fd7c4e70139032463e562d1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 29 Aug 2024 11:01:17 -0600
Subject: [PATCH 153/571] minor - fix stray CeedInt=>PetscInt

---
 examples/petsc/multigrid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index 1eb51d6dd0..346e395e6e 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -181,7 +181,7 @@ int main(int argc, char **argv) {
   CeedElemTopology elem_topo = ElemTopologyP2C(cell_type);
 
   // Setup DM and Operator Mat Shells for each level
-  for (CeedInt i = 0; i < num_levels; i++) {
+  for (PetscInt i = 0; i < num_levels; i++) {
     // Create DM
     PetscCall(DMClone(dm_orig, &dm[i]));
     PetscCall(DMGetVecType(dm_orig, &vec_type));

From 41655a2310ae869cbaf66471563d33865f525ec1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 22 Aug 2024 15:52:41 -0600
Subject: [PATCH 154/571] gpu - reuse evecs where able

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 48 +++++++++++++++++++---
 backends/hip-ref/ceed-hip-ref-operator.c   | 48 +++++++++++++++++++---
 2 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 748237dab8..0defda9772 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -264,7 +264,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
-  // Set up infield and outfield e_vecs and q_vecs
+  // Set up infield and outfield e-vecs and q-vecs
   // Infields
   CeedCallBackend(
       CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
@@ -272,6 +272,44 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
                                                num_input_fields, num_output_fields, Q, num_elem));
 
+  // Reuse active e-vecs where able
+  {
+    CeedInt              num_used  = 0;
+    CeedElemRestriction *rstr_used = NULL;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_used = false;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = 0; j < num_used; j++) {
+        if (rstr_i == rstr_used[i]) is_used = true;
+      }
+      if (is_used) continue;
+      num_used++;
+      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
+      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
+      rstr_used[num_used - 1] = rstr_i;
+      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
+        CeedEvalMode        eval_mode;
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        if (vec_j != CEED_VECTOR_ACTIVE) continue;
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        if (eval_mode == CEED_EVAL_NONE) continue;
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        }
+      }
+    }
+    CeedCallBackend(CeedFree(&rstr_used));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -310,7 +348,7 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu
         uint64_t state;
 
         CeedCallBackend(CeedVectorGetState(vec, &state));
-        if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
+        if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
           CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
         }
         impl->input_states[i] = state;
@@ -435,6 +473,9 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
   // Q function
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
 
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode        eval_mode;
@@ -490,9 +531,6 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
   }
-
-  // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 6045d0ad96..268292b60c 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -263,7 +263,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
-  // Set up infield and outfield e_vecs and q_vecs
+  // Set up infield and outfield e-vecs and q-vecs
   // Infields
   CeedCallBackend(
       CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
@@ -271,6 +271,44 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
                                               num_input_fields, num_output_fields, Q, num_elem));
 
+  // Reuse active e-vecs where able
+  {
+    CeedInt              num_used  = 0;
+    CeedElemRestriction *rstr_used = NULL;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_used = false;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = 0; j < num_used; j++) {
+        if (rstr_i == rstr_used[i]) is_used = true;
+      }
+      if (is_used) continue;
+      num_used++;
+      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
+      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
+      rstr_used[num_used - 1] = rstr_i;
+      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
+        CeedEvalMode        eval_mode;
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        if (vec_j != CEED_VECTOR_ACTIVE) continue;
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        if (eval_mode == CEED_EVAL_NONE) continue;
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        }
+      }
+    }
+    CeedCallBackend(CeedFree(&rstr_used));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -309,7 +347,7 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun
         uint64_t state;
 
         CeedCallBackend(CeedVectorGetState(vec, &state));
-        if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
+        if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
           CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
         }
         impl->input_states[i] = state;
@@ -434,6 +472,9 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
   // Q function
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
 
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode        eval_mode;
@@ -489,9 +530,6 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
   }
-
-  // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
   return CEED_ERROR_SUCCESS;
 }
 

From 8a21357021cf95ebb58183dc35a646c90195a1f2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 26 Aug 2024 12:59:59 -0600
Subject: [PATCH 155/571] gpu - reuse evecs for AtPoints where able

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 61 ++++++++++++++++++++--
 backends/cuda-ref/ceed-cuda-ref.h          |  2 +-
 backends/hip-ref/ceed-hip-ref-operator.c   | 61 ++++++++++++++++++++--
 backends/hip-ref/ceed-hip-ref.h            |  2 +-
 4 files changed, 114 insertions(+), 12 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 0defda9772..04eb0fb0fd 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -310,6 +310,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
     }
     CeedCallBackend(CeedFree(&rstr_used));
   }
+  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -574,7 +575,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
-  // Set up infield and outfield e_vecs and q_vecs
+  // Set up infield and outfield e-vecs and q-vecs
   // Infields
   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
                                                max_num_points, num_elem));
@@ -582,6 +583,45 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
                                                num_input_fields, num_output_fields, max_num_points, num_elem));
 
+  // Reuse active e-vecs where able
+  {
+    CeedInt              num_used  = 0;
+    CeedElemRestriction *rstr_used = NULL;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_used = false;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = 0; j < num_used; j++) {
+        if (rstr_i == rstr_used[i]) is_used = true;
+      }
+      if (is_used) continue;
+      num_used++;
+      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
+      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
+      rstr_used[num_used - 1] = rstr_i;
+      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
+        CeedEvalMode        eval_mode;
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        if (vec_j != CEED_VECTOR_ACTIVE) continue;
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        if (eval_mode == CEED_EVAL_NONE) continue;
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        }
+      }
+    }
+    CeedCallBackend(CeedFree(&rstr_used));
+  }
+  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -684,6 +724,9 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   // Q function
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
 
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode        eval_mode;
@@ -741,9 +784,6 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
   }
-
-  // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -868,7 +908,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
   }
 
-  // Un-set output q_vecs to prevent accidental overwrite of Assembled
+  // Un-set output q-vecs to prevent accidental overwrite of Assembled
   for (CeedInt out = 0; out < num_output_fields; out++) {
     CeedVector vec;
 
@@ -1595,6 +1635,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   max_num_points = impl->max_num_points;
   for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
+  // Create separate output e-vecs
+  if (impl->has_shared_e_vecs) {
+    for (CeedInt i = 0; i < impl->num_outputs; i++) {
+      CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
+      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i]));
+    }
+    CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
+                                                 num_input_fields, num_output_fields, max_num_points, num_elem));
+  }
+  impl->has_shared_e_vecs = false;
+
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
 
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index ff0bbaf349..c664a38ed7 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -128,7 +128,7 @@ typedef struct {
 } CeedOperatorAssemble_Cuda;
 
 typedef struct {
-  bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
   uint64_t                  *input_states;  // State tracking for passive inputs
   CeedVector                *e_vecs;        // E-vectors, inputs followed by outputs
   CeedVector                *q_vecs_in;     // Input Q-vectors needed to apply operator
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 268292b60c..509555a375 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -309,6 +309,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
     }
     CeedCallBackend(CeedFree(&rstr_used));
   }
+  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -573,7 +574,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
-  // Set up infield and outfield e_vecs and q_vecs
+  // Set up infield and outfield e-vecs and q-vecs
   // Infields
   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
                                               max_num_points, num_elem));
@@ -581,6 +582,45 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
                                               num_input_fields, num_output_fields, max_num_points, num_elem));
 
+  // Reuse active e-vecs where able
+  {
+    CeedInt              num_used  = 0;
+    CeedElemRestriction *rstr_used = NULL;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_used = false;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = 0; j < num_used; j++) {
+        if (rstr_i == rstr_used[i]) is_used = true;
+      }
+      if (is_used) continue;
+      num_used++;
+      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
+      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
+      rstr_used[num_used - 1] = rstr_i;
+      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
+        CeedEvalMode        eval_mode;
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        if (vec_j != CEED_VECTOR_ACTIVE) continue;
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        if (eval_mode == CEED_EVAL_NONE) continue;
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        }
+      }
+    }
+    CeedCallBackend(CeedFree(&rstr_used));
+  }
+  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -683,6 +723,9 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   // Q function
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
 
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode        eval_mode;
@@ -740,9 +783,6 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
   }
-
-  // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -867,7 +907,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
   }
 
-  // Un-set output q_vecs to prevent accidental overwrite of Assembled
+  // Un-set output q-vecs to prevent accidental overwrite of Assembled
   for (CeedInt out = 0; out < num_output_fields; out++) {
     CeedVector vec;
 
@@ -1592,6 +1632,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   max_num_points = impl->max_num_points;
   for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
+  // Create separate output e-vecs
+  if (impl->has_shared_e_vecs) {
+    for (CeedInt i = 0; i < impl->num_outputs; i++) {
+      CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
+      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i]));
+    }
+    CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
+                                                num_input_fields, num_output_fields, max_num_points, num_elem));
+  }
+  impl->has_shared_e_vecs = false;
+
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
 
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 59f8d809c2..38c91060b4 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -132,7 +132,7 @@ typedef struct {
 } CeedOperatorAssemble_Hip;
 
 typedef struct {
-  bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
   uint64_t                 *input_states;  // State tracking for passive inputs
   CeedVector               *e_vecs;        // E-vectors, inputs followed by outputs
   CeedVector               *q_vecs_in;     // Input Q-vectors needed to apply operator

From 2097acd54c344b4e8e24615cff43ae433aeb1497 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 29 Aug 2024 16:15:00 -0600
Subject: [PATCH 156/571] dealii - remove duplicate GPU vec allocations

---
 examples/deal.II/bps.h | 84 ++++++++++++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 23 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index d4f5465bf6..e43a9fbe4a 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -27,7 +27,8 @@
 #include <deal.II/matrix_free/tools.h>
 
 // libCEED includes
-#include <ceed/ceed.h>
+#include <ceed.h>
+#include <ceed/backend.h>
 
 // QFunction source
 #include "bps-qfunctions.h"
@@ -160,6 +161,8 @@ class OperatorCeed : public OperatorBase<Number>
    */
   ~OperatorCeed()
   {
+    CeedVectorDestroy(&src_ceed);
+    CeedVectorDestroy(&dst_ceed);
     CeedOperatorDestroy(&op_apply);
     CeedDestroy(&ceed);
   }
@@ -302,7 +305,11 @@ class OperatorCeed : public OperatorBase<Number>
     CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
     CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
 
-    // 7) cleanup
+    // 7) libCEED vectors
+    CeedElemRestrictionCreateVector(sol_restriction, &src_ceed, NULL);
+    CeedElemRestrictionCreateVector(sol_restriction, &dst_ceed, NULL);
+
+    // 8) cleanup
     CeedVectorDestroy(&q_data);
     CeedElemRestrictionDestroy(&q_data_restriction);
     CeedElemRestrictionDestroy(&sol_restriction);
@@ -322,12 +329,18 @@ class OperatorCeed : public OperatorBase<Number>
 
     if (dof_handler.get_fe().n_components() == 1)
       {
-        // create libCEED view on deal.II vectors
-        VectorTypeCeed src_ceed(ceed, src);
-        VectorTypeCeed dst_ceed(ceed, dst);
+        // pass memory buffers to libCEED
+        VectorTypeCeed x(src_ceed);
+        VectorTypeCeed y(dst_ceed);
+        x.set_array(src);
+        y.set_array(dst);
 
         // apply operator
-        CeedOperatorApply(op_apply, src_ceed(), dst_ceed(), CEED_REQUEST_IMMEDIATE);
+        CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
+
+        // pull arrays back to deal.II
+        x.sync_to_host();
+        y.sync_to_host();
       }
     else // TODO: needed for multiple components
       {
@@ -335,17 +348,24 @@ class OperatorCeed : public OperatorBase<Number>
         src_tmp.reinit(this->extended_local_size(), true);
         dst_tmp.reinit(this->extended_local_size(), true);
 
-        copy_to_block_vector(src_tmp, src); // copy to block vector
+        // copy to block vector
+        copy_to_block_vector(src_tmp, src);
 
-        // create libCEED view on deal.II vectors
-        VectorTypeCeed src_ceed(ceed, src_tmp);
-        VectorTypeCeed dst_ceed(ceed, dst_tmp);
+        // pass memory buffers to libCEED
+        VectorTypeCeed x(src_ceed);
+        VectorTypeCeed y(dst_ceed);
+        x.set_array(src_tmp);
+        y.set_array(dst_tmp);
 
         // apply operator
-        CeedOperatorApply(op_apply, src_ceed(), dst_ceed(), CEED_REQUEST_IMMEDIATE);
+        CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
 
-        dst_ceed.sync_to_host();              // pull libCEED data back to host
-        copy_from_block_vector(dst, dst_tmp); // copy from block vector
+        // pull arrays back to deal.II
+        x.sync_to_host();
+        y.sync_to_host();
+
+        // copy from block vector
+        copy_from_block_vector(dst, dst_tmp);
       }
 
     // communicate: compress
@@ -373,9 +393,14 @@ class OperatorCeed : public OperatorBase<Number>
   {
     this->initialize_dof_vector(diagonal);
 
-    VectorTypeCeed diagonal_ceed(ceed, diagonal);
+    // pass memory buffer to libCEED
+    VectorTypeCeed y(dst_ceed);
+    y.set_array(diagonal);
+
+    CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
 
-    CeedOperatorLinearAssembleDiagonal(op_apply, diagonal_ceed(), CEED_REQUEST_IMMEDIATE);
+    // pull array back to deal.II
+    y.sync_to_host();
 
     const unsigned int n_components = dof_handler.get_fe().n_components();
 
@@ -404,13 +429,10 @@ class OperatorCeed : public OperatorBase<Number>
     /**
      * Constructor.
      */
-    VectorTypeCeed(const Ceed &ceed, const VectorType &vec)
+    VectorTypeCeed(const CeedVector &vec_orig)
     {
-      const unsigned int n_dofs =
-        vec.get_partitioner()->locally_owned_size() + vec.get_partitioner()->n_ghost_indices();
-
-      CeedVectorCreate(ceed, n_dofs, &vec_ceed);
-      CeedVectorSetArray(vec_ceed, CEED_MEM_HOST, CEED_USE_POINTER, vec.get_values());
+      vec_ceed = NULL;
+      CeedVectorReferenceCopy(vec_orig, &vec_ceed);
     }
 
     /**
@@ -422,6 +444,15 @@ class OperatorCeed : public OperatorBase<Number>
       return vec_ceed;
     }
 
+    /**
+     * Set deal.II memory in libCEED vector.
+     */
+    void
+    set_array(const VectorType &vec)
+    {
+      CeedVectorSetArray(vec_ceed, CEED_MEM_HOST, CEED_USE_POINTER, vec.get_values());
+    }
+
     /**
      * Sync memory from device to host.
      */
@@ -436,8 +467,13 @@ class OperatorCeed : public OperatorBase<Number>
      */
     ~VectorTypeCeed()
     {
-      CeedScalar *ptr;
-      CeedVectorTakeArray(vec_ceed, CEED_MEM_HOST, &ptr);
+      bool has_array;
+      CeedVectorHasBorrowedArrayOfType(vec_ceed, CEED_MEM_HOST, &has_array);
+      if (has_array)
+        {
+          CeedScalar *ptr;
+          CeedVectorTakeArray(vec_ceed, CEED_MEM_HOST, &ptr);
+        }
       CeedVectorDestroy(&vec_ceed);
     }
 
@@ -705,6 +741,8 @@ class OperatorCeed : public OperatorBase<Number>
   Ceed                   ceed;
   std::vector<double>    weights;
   std::array<CeedInt, 3> strides;
+  CeedVector             src_ceed;
+  CeedVector             dst_ceed;
   CeedOperator           op_apply;
 
   /**

From 0850f9959daaba99c47e08ca6af4589f67130272 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 30 Aug 2024 10:20:47 -0600
Subject: [PATCH 157/571] dealii - update array handling method names

---
 examples/deal.II/bps.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index e43a9fbe4a..32ac936aa7 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -332,15 +332,15 @@ class OperatorCeed : public OperatorBase<Number>
         // pass memory buffers to libCEED
         VectorTypeCeed x(src_ceed);
         VectorTypeCeed y(dst_ceed);
-        x.set_array(src);
-        y.set_array(dst);
+        x.import_array(src, CEED_MEM_HOST);
+        y.import_array(dst, CEED_MEM_HOST);
 
         // apply operator
         CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
 
         // pull arrays back to deal.II
-        x.sync_to_host();
-        y.sync_to_host();
+        x.sync_array();
+        y.sync_array();
       }
     else // TODO: needed for multiple components
       {
@@ -354,15 +354,15 @@ class OperatorCeed : public OperatorBase<Number>
         // pass memory buffers to libCEED
         VectorTypeCeed x(src_ceed);
         VectorTypeCeed y(dst_ceed);
-        x.set_array(src_tmp);
-        y.set_array(dst_tmp);
+        x.import_array(src_tmp, CEED_MEM_HOST);
+        y.import_array(dst_tmp, CEED_MEM_HOST);
 
         // apply operator
         CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
 
         // pull arrays back to deal.II
-        x.sync_to_host();
-        y.sync_to_host();
+        x.sync_array();
+        y.sync_array();
 
         // copy from block vector
         copy_from_block_vector(dst, dst_tmp);
@@ -395,12 +395,12 @@ class OperatorCeed : public OperatorBase<Number>
 
     // pass memory buffer to libCEED
     VectorTypeCeed y(dst_ceed);
-    y.set_array(diagonal);
+    y.import_array(diagonal, CEED_MEM_HOST);
 
     CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
 
     // pull array back to deal.II
-    y.sync_to_host();
+    y.sync_array();
 
     const unsigned int n_components = dof_handler.get_fe().n_components();
 
@@ -448,18 +448,19 @@ class OperatorCeed : public OperatorBase<Number>
      * Set deal.II memory in libCEED vector.
      */
     void
-    set_array(const VectorType &vec)
+    import_array(const VectorType &vec, const CeedMemType space)
     {
-      CeedVectorSetArray(vec_ceed, CEED_MEM_HOST, CEED_USE_POINTER, vec.get_values());
+      mem_space = space;
+      CeedVectorSetArray(vec_ceed, mem_space, CEED_USE_POINTER, vec.get_values());
     }
 
     /**
      * Sync memory from device to host.
      */
     void
-    sync_to_host()
+    sync_array()
     {
-      CeedVectorSyncArray(vec_ceed, CEED_MEM_HOST);
+      CeedVectorSyncArray(vec_ceed, mem_space);
     }
 
     /**
@@ -468,11 +469,11 @@ class OperatorCeed : public OperatorBase<Number>
     ~VectorTypeCeed()
     {
       bool has_array;
-      CeedVectorHasBorrowedArrayOfType(vec_ceed, CEED_MEM_HOST, &has_array);
+      CeedVectorHasBorrowedArrayOfType(vec_ceed, mem_space, &has_array);
       if (has_array)
         {
           CeedScalar *ptr;
-          CeedVectorTakeArray(vec_ceed, CEED_MEM_HOST, &ptr);
+          CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
         }
       CeedVectorDestroy(&vec_ceed);
     }
@@ -481,7 +482,8 @@ class OperatorCeed : public OperatorBase<Number>
     /**
      * libCEED vector view.
      */
-    CeedVector vec_ceed;
+    CeedMemType mem_space;
+    CeedVector  vec_ceed;
   };
 
   /**

From 2e88d319acd75658c85aee6ae8a14c6e519e582d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 3 Sep 2024 10:34:06 -0600
Subject: [PATCH 158/571] gpu - counting points correctly

---
 backends/cuda-ref/ceed-cuda-ref-restriction.c | 2 +-
 backends/hip-ref/ceed-hip-ref-restriction.c   | 2 +-
 interface/ceed-basis.c                        | 6 ++++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 0f20ceca7d..8ed9e0c60e 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -604,7 +604,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
     // -- Use padded offsets for the rest of the setup
     offsets   = (const CeedInt *)offsets_padded;
     copy_mode = CEED_OWN_POINTER;
-    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, at_points_size * num_comp));
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp));
 
     // -- Points per element
     CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned,
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 41bba37520..eff205a018 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -603,7 +603,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
     // -- Use padded offsets for the rest of the setup
     offsets   = (const CeedInt *)offsets_padded;
     copy_mode = CEED_OWN_POINTER;
-    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, at_points_size * num_comp));
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp));
 
     // -- Points per element
     CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned,
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 6be4e3ddf7..132118cb5a 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1640,14 +1640,16 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
   if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length));
 
   // Check compatibility of topological and geometrical dimensions
-  for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
   CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0) ||
                 (eval_mode == CEED_EVAL_WEIGHT),
             ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions and number of points");
 
   // Check compatibility coordinates vector
+  for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
   CeedCheck((x_length >= total_num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
-            "Length of reference coordinate vector incompatible with basis dimension and number of points");
+            "Length of reference coordinate vector incompatible with basis dimension and number of points."
+            " Found reference coordinate vector of length %" CeedSize_FMT ", not of length %" CeedSize_FMT ".",
+            x_length, total_num_points * dim);
 
   // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
   CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_UNSUPPORTED,

From 111870fe44216042e3619edd402b3642abc96e10 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 4 Sep 2024 09:54:50 -0600
Subject: [PATCH 159/571] AtPoints - fix transpose basis apply on GPU

---
 backends/cuda-ref/ceed-cuda-ref-basis.c       | 51 +++++++++++++++----
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 24 +++++----
 backends/cuda-ref/ceed-cuda-ref.h             |  4 ++
 backends/cuda-shared/ceed-cuda-shared-basis.c | 51 +++++++++++++++----
 backends/cuda-shared/ceed-cuda-shared.h       |  3 ++
 backends/hip-ref/ceed-hip-ref-basis.c         | 51 +++++++++++++++----
 backends/hip-ref/ceed-hip-ref-operator.c      | 24 +++++----
 backends/hip-ref/ceed-hip-ref.h               |  4 ++
 backends/hip-shared/ceed-hip-shared-basis.c   | 51 +++++++++++++++----
 backends/hip-shared/ceed-hip-shared.h         |  3 ++
 .../cuda/cuda-ref-basis-tensor-at-points.h    |  8 ++-
 .../hip/hip-ref-basis-tensor-at-points.h      |  8 ++-
 12 files changed, 220 insertions(+), 62 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 5efaeee456..738d0c4834 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -10,6 +10,7 @@
 #include <ceed/jit-tools.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -115,18 +116,46 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -186,14 +215,14 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -343,6 +372,8 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
   CeedCallCuda(ceed, cuModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 04eb0fb0fd..fcc7631e6c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
+  CeedCallBackend(CeedFree(&impl->num_points));
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
@@ -557,10 +558,17 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   {
-    CeedElemRestriction elem_rstr = NULL;
+    CeedElemRestriction rstr_points = NULL;
 
-    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
-    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt num_points_elem;
+
+      CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+      impl->num_points[e] = num_points_elem;
+    }
   }
   impl->max_num_points = max_num_points;
 
@@ -674,7 +682,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const Ce
 // Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
+  CeedInt             max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -686,12 +694,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
@@ -1616,7 +1623,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -1628,12 +1635,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Create separate output e-vecs
   if (impl->has_shared_e_vecs) {
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index c664a38ed7..9a63a5f4f4 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -75,6 +75,9 @@ typedef struct {
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
   CeedScalar *d_chebyshev_interp_1d;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda;
 
 typedef struct {
@@ -136,6 +139,7 @@ typedef struct {
   CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
   CeedInt                    max_num_points;
+  CeedInt                   *num_points;
   CeedVector                *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Cuda     *diag;
   CeedOperatorAssemble_Cuda *asmb;
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 8245149f6d..664694b859 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -12,6 +12,7 @@
 #include <cuda_runtime.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -221,18 +222,46 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -292,14 +321,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -345,6 +374,8 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
   CeedCallCuda(ceed, cuModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index 96800a0a86..d70d75ab94 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -30,6 +30,9 @@ typedef struct {
   CeedScalar *d_chebyshev_interp_1d;
   CeedScalar *c_B;
   CeedScalar *c_G;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index ea3f4e6e3a..fdae91a82b 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -8,6 +8,7 @@
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <ceed/jit-tools.h>
+#include <string.h>
 #include <hip/hip_runtime.h>
 
 #include "../hip/ceed-hip-common.h"
@@ -113,18 +114,46 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -184,14 +213,14 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -341,6 +370,8 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
   CeedCallHip(ceed, hipModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 509555a375..8bc3f0bc35 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -26,6 +26,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
+  CeedCallBackend(CeedFree(&impl->num_points));
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
@@ -556,10 +557,17 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   {
-    CeedElemRestriction elem_rstr = NULL;
+    CeedElemRestriction rstr_points = NULL;
 
-    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
-    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt num_points_elem;
+
+      CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+      impl->num_points[e] = num_points_elem;
+    }
   }
   impl->max_num_points = max_num_points;
 
@@ -673,7 +681,7 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedInt num_elem, const Cee
 // Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
+  CeedInt             max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -685,12 +693,11 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
@@ -1613,7 +1620,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, Ceed
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -1625,12 +1632,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Create separate output e-vecs
   if (impl->has_shared_e_vecs) {
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 38c91060b4..02fb567517 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -79,6 +79,9 @@ typedef struct {
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_q_weight_1d;
   CeedScalar   *d_chebyshev_interp_1d;
+  CeedInt       num_elem_at_points;
+  CeedInt      *h_points_per_elem;
+  CeedInt      *d_points_per_elem;
 } CeedBasis_Hip;
 
 typedef struct {
@@ -140,6 +143,7 @@ typedef struct {
   CeedInt                   num_inputs, num_outputs;
   CeedInt                   num_active_in, num_active_out;
   CeedInt                   max_num_points;
+  CeedInt                  *num_points;
   CeedVector               *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Hip     *diag;
   CeedOperatorAssemble_Hip *asmb;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 959078e5b1..76819c481c 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -10,6 +10,7 @@
 #include <ceed/jit-tools.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 #include <hip/hip_runtime.h>
 
 #include "../hip/ceed-hip-common.h"
@@ -280,18 +281,46 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -351,14 +380,14 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -404,6 +433,8 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
   CeedCallHip(ceed, hipModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index fe3384f55d..c000b7f873 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -29,6 +29,9 @@ typedef struct {
   CeedScalar   *d_collo_grad_1d;
   CeedScalar   *d_q_weight_1d;
   CeedScalar   *d_chebyshev_interp_1d;
+  CeedInt       num_elem_at_points;
+  CeedInt      *h_points_per_elem;
+  CeedInt      *d_points_per_elem;
 } CeedBasis_Hip_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 263d29338e..7355705660 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -42,7 +42,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 // Interp
 //------------------------------------------------------------------------------
 extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                          const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -80,6 +81,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           pre  = 1;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -196,7 +198,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 // Grad
 //------------------------------------------------------------------------------
 extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -235,6 +238,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
 
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 22d81bc30a..4744b17eb2 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -42,7 +42,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 // Interp
 //------------------------------------------------------------------------------
 extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                          const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -80,6 +81,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           pre  = 1;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -196,7 +198,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 // Grad
 //------------------------------------------------------------------------------
 extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -235,6 +238,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
 

From 9e511c80210b4447ceabc14bc1a44d55934e0fbf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 4 Sep 2024 12:20:20 -0600
Subject: [PATCH 160/571] atpoints - copy when *not* the same

Co-authored-by: Zach Atkins <zach.atkins@colorado.edu>
---
 backends/cuda-ref/ceed-cuda-ref-basis.c       | 2 +-
 backends/cuda-shared/ceed-cuda-shared-basis.c | 2 +-
 backends/hip-ref/ceed-hip-ref-basis.c         | 2 +-
 backends/hip-shared/ceed-hip-shared-basis.c   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 738d0c4834..c245a51489 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -150,7 +150,7 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
       CeedCallBackend(CeedFree(&data->h_points_per_elem));
       CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
     }
-    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
       memcpy(data->h_points_per_elem, num_points, num_bytes);
       CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
     }
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 664694b859..8924ff52f4 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -256,7 +256,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
       CeedCallBackend(CeedFree(&data->h_points_per_elem));
       CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
     }
-    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
       memcpy(data->h_points_per_elem, num_points, num_bytes);
       CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
     }
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index fdae91a82b..70dda0a7da 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -148,7 +148,7 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
       CeedCallBackend(CeedFree(&data->h_points_per_elem));
       CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
     }
-    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
       memcpy(data->h_points_per_elem, num_points, num_bytes);
       CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
     }
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 76819c481c..307107ec6b 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -315,7 +315,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
       CeedCallBackend(CeedFree(&data->h_points_per_elem));
       CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
     }
-    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
       memcpy(data->h_points_per_elem, num_points, num_bytes);
       CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
     }

From 0b31fde2be28a49f7a2571dd394fda5f21a56567 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 5 Sep 2024 15:07:17 -0600
Subject: [PATCH 161/571] minor - move 2 functions to clean up docs

---
 interface/ceed-basis.c | 780 ++++++++++++++++++++---------------------
 1 file changed, 390 insertions(+), 390 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 132118cb5a..67b9d43345 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -294,136 +294,435 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   return CEED_ERROR_SUCCESS;
 }
 
-/// @}
-
-/// ----------------------------------------------------------------------------
-/// Ceed Backend API
-/// ----------------------------------------------------------------------------
-/// @addtogroup CeedBasisBackend
-/// @{
-
 /**
-  @brief Return collocated gradient matrix
+  @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints
 
-  @param[in]  basis         `CeedBasis`
-  @param[out] collo_grad_1d Row-major (`Q_1d * Q_1d`) matrix expressing derivatives of basis functions at quadrature points
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref Backend
+  @ref Developer
 **/
-int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
-  Ceed              ceed;
-  CeedInt           P_1d, Q_1d;
-  CeedScalar       *interp_1d_pinv;
-  const CeedScalar *grad_1d, *interp_1d;
+static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                           CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0;
+  CeedSize x_length = 0, u_length = 0, v_length;
+  Ceed     ceed;
 
-  // Note: This function is for backend use, so all errors are terminal and we do not need to clean up memory on failure.
   CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetDimension(basis, &dim));
   CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
   CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &num_q_comp));
+  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
+  CeedCall(CeedVectorGetLength(v, &v_length));
+  if (x_ref != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(x_ref, &x_length));
+  if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length));
 
-  // Compute interp_1d^+, pseudoinverse of interp_1d
-  CeedCall(CeedCalloc(P_1d * Q_1d, &interp_1d_pinv));
-  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
-  CeedCall(CeedMatrixPseudoinverse(ceed, interp_1d, Q_1d, P_1d, interp_1d_pinv));
-  CeedCall(CeedBasisGetGrad1D(basis, &grad_1d));
-  CeedCall(CeedMatrixMatrixMultiply(ceed, grad_1d, (const CeedScalar *)interp_1d_pinv, collo_grad_1d, Q_1d, Q_1d, P_1d));
+  // Check compatibility of topological and geometrical dimensions
+  CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0) ||
+                (eval_mode == CEED_EVAL_WEIGHT),
+            ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions and number of points");
 
-  CeedCall(CeedFree(&interp_1d_pinv));
+  // Check compatibility coordinates vector
+  for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
+  CeedCheck((x_length >= total_num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
+            "Length of reference coordinate vector incompatible with basis dimension and number of points."
+            " Found reference coordinate vector of length %" CeedSize_FMT ", not of length %" CeedSize_FMT ".",
+            x_length, total_num_points * dim);
+
+  // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
+  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_UNSUPPORTED,
+            "CEED_EVAL_WEIGHT only supported with CEED_NOTRANSPOSE");
+
+  // Check vector lengths to prevent out of bounds issues
+  bool has_good_dims = true;
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP:
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp || v_length >= num_elem * num_nodes * num_comp)) ||
+                       (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp || u_length >= num_elem * num_nodes * num_comp)));
+      break;
+    case CEED_EVAL_GRAD:
+      has_good_dims =
+          ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp * dim || v_length >= num_elem * num_nodes * num_comp)) ||
+           (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp * dim || u_length >= num_elem * num_nodes * num_comp)));
+      break;
+    case CEED_EVAL_WEIGHT:
+      has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points);
+      break;
+      // LCOV_EXCL_START
+    case CEED_EVAL_NONE:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+  CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Return 1D interpolation matrix to Chebyshev polynomial coefficients on quadrature space
+  @brief Default implimentation to apply basis evaluation from nodes to arbitrary points
 
-  @param[in]  basis               `CeedBasis`
-  @param[out] chebyshev_interp_1d Row-major (`P_1d * Q_1d`) matrix interpolating from basis nodes to Chebyshev polynomial coefficients
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  apply_add  Sum result into target vector or overwrite
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref Backend
+  @ref Developer
 **/
-int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d) {
-  CeedInt           P_1d, Q_1d;
-  CeedScalar       *C, *chebyshev_coeffs_1d_inv;
-  const CeedScalar *interp_1d, *q_ref_1d;
-  Ceed              ceed;
+static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0];
+  Ceed    ceed;
 
   CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetDimension(basis, &dim));
+  // Inserting check because clang-tidy doesn't understand this cannot occur
+  CeedCheck(dim > 0, ceed, CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required");
   CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
   CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
 
-  // Build coefficient matrix
-  // -- Note: Clang-tidy needs this check
-  CeedCheck((P_1d > 0) && (Q_1d > 0), ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
-  CeedCall(CeedCalloc(Q_1d * Q_1d, &C));
-  CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
-  for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d]));
-
-  // Compute C^+, pseudoinverse of coefficient matrix
-  CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv));
-  CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv));
-
-  // Build mapping from nodes to Chebyshev coefficients
-  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
-  CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d));
-
-  // Cleanup
-  CeedCall(CeedFree(&C));
-  CeedCall(CeedFree(&chebyshev_coeffs_1d_inv));
-  return CEED_ERROR_SUCCESS;
-}
-
-/**
-  @brief Get tensor status for given `CeedBasis`
+  // Default implementation
+  {
+    bool is_tensor_basis;
 
-  @param[in]  basis     `CeedBasis`
-  @param[out] is_tensor Variable to store tensor status
+    CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
+    CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases");
+  }
+  CeedCheck(num_elem == 1, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary  points only supported for a single element at a time");
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCall(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+  if (!basis->basis_chebyshev) {
+    // Build basis mapping from nodes to Chebyshev coefficients
+    CeedScalar       *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d;
+    const CeedScalar *q_ref_1d;
 
-  @return An error code: 0 - success, otherwise - failure
+    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d));
+    CeedCall(CeedCalloc(Q_1d, &chebyshev_q_weight_1d));
+    CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
+    CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
 
-  @ref Backend
-**/
-int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) {
-  *is_tensor = basis->is_tensor_basis;
-  return CEED_ERROR_SUCCESS;
-}
+    CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev));
+    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d,
+                                     &basis->basis_chebyshev));
 
-/**
-  @brief Get backend data of a `CeedBasis`
+    // Cleanup
+    CeedCall(CeedFree(&chebyshev_interp_1d));
+    CeedCall(CeedFree(&chebyshev_grad_1d));
+    CeedCall(CeedFree(&chebyshev_q_weight_1d));
+  }
 
-  @param[in]  basis `CeedBasis`
-  @param[out] data  Variable to store data
+  // Create TensorContract object if needed, such as a basis from the GPU backends
+  if (!basis->contract) {
+    Ceed      ceed_ref;
+    CeedBasis basis_ref = NULL;
 
-  @return An error code: 0 - success, otherwise - failure
+    CeedCall(CeedInit("/cpu/self", &ceed_ref));
+    // Only need matching tensor contraction dimensions, any type of basis will work
+    CeedCall(CeedBasisCreateTensorH1Lagrange(ceed_ref, dim, num_comp, P_1d, Q_1d, CEED_GAUSS, &basis_ref));
+    // Note - clang-tidy doesn't know basis_ref->contract must be valid here
+    CeedCheck(basis_ref && basis_ref->contract, ceed, CEED_ERROR_UNSUPPORTED, "Reference CPU ceed failed to create a tensor contraction object");
+    CeedCall(CeedTensorContractReferenceCopy(basis_ref->contract, &basis->contract));
+    CeedCall(CeedBasisDestroy(&basis_ref));
+    CeedCall(CeedDestroy(&ceed_ref));
+  }
 
-  @ref Backend
-**/
-int CeedBasisGetData(CeedBasis basis, void *data) {
-  *(void **)data = basis->data;
-  return CEED_ERROR_SUCCESS;
-}
+  // Basis evaluation
+  switch (t_mode) {
+    case CEED_NOTRANSPOSE: {
+      // Nodes to arbitrary points
+      CeedScalar       *v_array;
+      const CeedScalar *chebyshev_coeffs, *x_array_read;
 
-/**
-  @brief Set backend data of a `CeedBasis`
+      // -- Interpolate to Chebyshev coefficients
+      CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, basis->vec_chebyshev));
 
-  @param[in,out] basis  `CeedBasis`
-  @param[in]     data   Data to set
+      // -- Evaluate Chebyshev polynomials at arbitrary points
+      CeedCall(CeedVectorGetArrayRead(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
+      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
+      CeedCall(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &v_array));
+      switch (eval_mode) {
+        case CEED_EVAL_INTERP: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
 
-  @return An error code: 0 - success, otherwise - failure
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
 
-  @ref Backend
-**/
-int CeedBasisSetData(CeedBasis basis, void *data) {
-  basis->data = data;
-  return CEED_ERROR_SUCCESS;
-}
+            for (CeedInt d = 0; d < dim; d++) {
+              // ------ Tensor contract with current Chebyshev polynomial values
+              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+              CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
+                                               d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
+              pre /= Q_1d;
+              post *= 1;
+            }
+            for (CeedInt c = 0; c < num_comp; c++) v_array[c * total_num_points + p] = tmp[dim % 2][c];
+          }
+          break;
+        }
+        case CEED_EVAL_GRAD: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
 
-/**
-  @brief Increment the reference counter for a `CeedBasis`
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            // Dim**2 contractions, apply grad when pass == dim
+            for (CeedInt pass = 0; pass < dim; pass++) {
+              CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
 
-  @param[in,out] basis `CeedBasis` to increment the reference counter
+              for (CeedInt d = 0; d < dim; d++) {
+                // ------ Tensor contract with current Chebyshev polynomial values
+                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
+                                                 d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
+                pre /= Q_1d;
+                post *= 1;
+              }
+              for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * total_num_points + p] = tmp[dim % 2][c];
+            }
+          }
+          break;
+        }
+        default:
+          // Nothing to do, excluded above
+          break;
+      }
+      CeedCall(CeedVectorRestoreArrayRead(basis->vec_chebyshev, &chebyshev_coeffs));
+      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
+      CeedCall(CeedVectorRestoreArray(v, &v_array));
+      break;
+    }
+    case CEED_TRANSPOSE: {
+      // Note: No switch on e_mode here because only CEED_EVAL_INTERP is supported at this time
+      // Arbitrary points to nodes
+      CeedScalar       *chebyshev_coeffs;
+      const CeedScalar *u_array, *x_array_read;
+
+      // -- Transpose of evaluation of Chebyshev polynomials at arbitrary points
+      CeedCall(CeedVectorGetArrayWrite(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
+      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
+      CeedCall(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array));
+
+      switch (eval_mode) {
+        case CEED_EVAL_INTERP: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
+
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            CeedInt pre = num_comp * 1, post = 1;
+
+            for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * total_num_points + p];
+            for (CeedInt d = 0; d < dim; d++) {
+              // ------ Tensor contract with current Chebyshev polynomial values
+              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+              CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, p > 0 && d == (dim - 1), tmp[d % 2],
+                                               d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
+              pre /= 1;
+              post *= Q_1d;
+            }
+          }
+          break;
+        }
+        case CEED_EVAL_GRAD: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
+
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            // Dim**2 contractions, apply grad when pass == dim
+            for (CeedInt pass = 0; pass < dim; pass++) {
+              CeedInt pre = num_comp * 1, post = 1;
+
+              for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * total_num_points + p];
+              for (CeedInt d = 0; d < dim; d++) {
+                // ------ Tensor contract with current Chebyshev polynomial values
+                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode,
+                                                 (p > 0 || (p == 0 && pass > 0)) && d == (dim - 1), tmp[d % 2],
+                                                 d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
+                pre /= 1;
+                post *= Q_1d;
+              }
+            }
+          }
+          break;
+        }
+        default:
+          // Nothing to do, excluded above
+          break;
+      }
+      CeedCall(CeedVectorRestoreArray(basis->vec_chebyshev, &chebyshev_coeffs));
+      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
+      CeedCall(CeedVectorRestoreArrayRead(u, &u_array));
+
+      // -- Interpolate transpose from Chebyshev coefficients
+      if (apply_add) CeedCall(CeedBasisApplyAdd(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
+      else CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
+      break;
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
+/// ----------------------------------------------------------------------------
+/// Ceed Backend API
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedBasisBackend
+/// @{
+
+/**
+  @brief Return collocated gradient matrix
+
+  @param[in]  basis         `CeedBasis`
+  @param[out] collo_grad_1d Row-major (`Q_1d * Q_1d`) matrix expressing derivatives of basis functions at quadrature points
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
+  Ceed              ceed;
+  CeedInt           P_1d, Q_1d;
+  CeedScalar       *interp_1d_pinv;
+  const CeedScalar *grad_1d, *interp_1d;
+
+  // Note: This function is for backend use, so all errors are terminal and we do not need to clean up memory on failure.
+  CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+
+  // Compute interp_1d^+, pseudoinverse of interp_1d
+  CeedCall(CeedCalloc(P_1d * Q_1d, &interp_1d_pinv));
+  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
+  CeedCall(CeedMatrixPseudoinverse(ceed, interp_1d, Q_1d, P_1d, interp_1d_pinv));
+  CeedCall(CeedBasisGetGrad1D(basis, &grad_1d));
+  CeedCall(CeedMatrixMatrixMultiply(ceed, grad_1d, (const CeedScalar *)interp_1d_pinv, collo_grad_1d, Q_1d, Q_1d, P_1d));
+
+  CeedCall(CeedFree(&interp_1d_pinv));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Return 1D interpolation matrix to Chebyshev polynomial coefficients on quadrature space
+
+  @param[in]  basis               `CeedBasis`
+  @param[out] chebyshev_interp_1d Row-major (`P_1d * Q_1d`) matrix interpolating from basis nodes to Chebyshev polynomial coefficients
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d) {
+  CeedInt           P_1d, Q_1d;
+  CeedScalar       *C, *chebyshev_coeffs_1d_inv;
+  const CeedScalar *interp_1d, *q_ref_1d;
+  Ceed              ceed;
+
+  CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+
+  // Build coefficient matrix
+  // -- Note: Clang-tidy needs this check
+  CeedCheck((P_1d > 0) && (Q_1d > 0), ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
+  CeedCall(CeedCalloc(Q_1d * Q_1d, &C));
+  CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
+  for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d]));
+
+  // Compute C^+, pseudoinverse of coefficient matrix
+  CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv));
+  CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv));
+
+  // Build mapping from nodes to Chebyshev coefficients
+  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
+  CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d));
+
+  // Cleanup
+  CeedCall(CeedFree(&C));
+  CeedCall(CeedFree(&chebyshev_coeffs_1d_inv));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get tensor status for given `CeedBasis`
+
+  @param[in]  basis     `CeedBasis`
+  @param[out] is_tensor Variable to store tensor status
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) {
+  *is_tensor = basis->is_tensor_basis;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get backend data of a `CeedBasis`
+
+  @param[in]  basis `CeedBasis`
+  @param[out] data  Variable to store data
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetData(CeedBasis basis, void *data) {
+  *(void **)data = basis->data;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set backend data of a `CeedBasis`
+
+  @param[in,out] basis  `CeedBasis`
+  @param[in]     data   Data to set
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisSetData(CeedBasis basis, void *data) {
+  basis->data = data;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Increment the reference counter for a `CeedBasis`
+
+  @param[in,out] basis `CeedBasis` to increment the reference counter
 
   @return An error code: 0 - success, otherwise - failure
 
@@ -1602,305 +1901,6 @@ int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mod
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints
-
-  @param[in]  basis      `CeedBasis` to evaluate
-  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
-                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
-  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
-  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
-                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
-  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
-                           @ref CEED_EVAL_GRAD to use gradients,
-                           @ref CEED_EVAL_WEIGHT to use quadrature weights
-  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
-  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
-  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
-
-  @return An error code: 0 - success, otherwise - failure
-
-  @ref Developer
-**/
-static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
-                                           CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
-  CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0;
-  CeedSize x_length = 0, u_length = 0, v_length;
-  Ceed     ceed;
-
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
-  CeedCall(CeedBasisGetDimension(basis, &dim));
-  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
-  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &num_q_comp));
-  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
-  CeedCall(CeedVectorGetLength(v, &v_length));
-  if (x_ref != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(x_ref, &x_length));
-  if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length));
-
-  // Check compatibility of topological and geometrical dimensions
-  CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0) ||
-                (eval_mode == CEED_EVAL_WEIGHT),
-            ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions and number of points");
-
-  // Check compatibility coordinates vector
-  for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
-  CeedCheck((x_length >= total_num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
-            "Length of reference coordinate vector incompatible with basis dimension and number of points."
-            " Found reference coordinate vector of length %" CeedSize_FMT ", not of length %" CeedSize_FMT ".",
-            x_length, total_num_points * dim);
-
-  // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
-  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_UNSUPPORTED,
-            "CEED_EVAL_WEIGHT only supported with CEED_NOTRANSPOSE");
-
-  // Check vector lengths to prevent out of bounds issues
-  bool has_good_dims = true;
-  switch (eval_mode) {
-    case CEED_EVAL_INTERP:
-      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp || v_length >= num_elem * num_nodes * num_comp)) ||
-                       (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp || u_length >= num_elem * num_nodes * num_comp)));
-      break;
-    case CEED_EVAL_GRAD:
-      has_good_dims =
-          ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp * dim || v_length >= num_elem * num_nodes * num_comp)) ||
-           (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp * dim || u_length >= num_elem * num_nodes * num_comp)));
-      break;
-    case CEED_EVAL_WEIGHT:
-      has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points);
-      break;
-      // LCOV_EXCL_START
-    case CEED_EVAL_NONE:
-    case CEED_EVAL_DIV:
-    case CEED_EVAL_CURL:
-      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s", CeedEvalModes[eval_mode]);
-      // LCOV_EXCL_STOP
-  }
-  CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
-  return CEED_ERROR_SUCCESS;
-}
-
-/**
-  @brief Default implimentation to apply basis evaluation from nodes to arbitrary points
-
-  @param[in]  basis      `CeedBasis` to evaluate
-  @param[in]  apply_add  Sum result into target vector or overwrite
-  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
-                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
-  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
-  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
-                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
-  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
-                           @ref CEED_EVAL_GRAD to use gradients,
-                           @ref CEED_EVAL_WEIGHT to use quadrature weights
-  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
-  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
-  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
-
-  @return An error code: 0 - success, otherwise - failure
-
-  @ref Developer
-**/
-static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
-                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
-  CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0];
-  Ceed    ceed;
-
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
-  CeedCall(CeedBasisGetDimension(basis, &dim));
-  // Inserting check because clang-tidy doesn't understand this cannot occur
-  CeedCheck(dim > 0, ceed, CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required");
-  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
-  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
-
-  // Default implementation
-  {
-    bool is_tensor_basis;
-
-    CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
-    CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases");
-  }
-  CeedCheck(num_elem == 1, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary  points only supported for a single element at a time");
-  if (eval_mode == CEED_EVAL_WEIGHT) {
-    CeedCall(CeedVectorSetValue(v, 1.0));
-    return CEED_ERROR_SUCCESS;
-  }
-  if (!basis->basis_chebyshev) {
-    // Build basis mapping from nodes to Chebyshev coefficients
-    CeedScalar       *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d;
-    const CeedScalar *q_ref_1d;
-
-    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
-    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d));
-    CeedCall(CeedCalloc(Q_1d, &chebyshev_q_weight_1d));
-    CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
-    CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
-
-    CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev));
-    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d,
-                                     &basis->basis_chebyshev));
-
-    // Cleanup
-    CeedCall(CeedFree(&chebyshev_interp_1d));
-    CeedCall(CeedFree(&chebyshev_grad_1d));
-    CeedCall(CeedFree(&chebyshev_q_weight_1d));
-  }
-
-  // Create TensorContract object if needed, such as a basis from the GPU backends
-  if (!basis->contract) {
-    Ceed      ceed_ref;
-    CeedBasis basis_ref = NULL;
-
-    CeedCall(CeedInit("/cpu/self", &ceed_ref));
-    // Only need matching tensor contraction dimensions, any type of basis will work
-    CeedCall(CeedBasisCreateTensorH1Lagrange(ceed_ref, dim, num_comp, P_1d, Q_1d, CEED_GAUSS, &basis_ref));
-    // Note - clang-tidy doesn't know basis_ref->contract must be valid here
-    CeedCheck(basis_ref && basis_ref->contract, ceed, CEED_ERROR_UNSUPPORTED, "Reference CPU ceed failed to create a tensor contraction object");
-    CeedCall(CeedTensorContractReferenceCopy(basis_ref->contract, &basis->contract));
-    CeedCall(CeedBasisDestroy(&basis_ref));
-    CeedCall(CeedDestroy(&ceed_ref));
-  }
-
-  // Basis evaluation
-  switch (t_mode) {
-    case CEED_NOTRANSPOSE: {
-      // Nodes to arbitrary points
-      CeedScalar       *v_array;
-      const CeedScalar *chebyshev_coeffs, *x_array_read;
-
-      // -- Interpolate to Chebyshev coefficients
-      CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, basis->vec_chebyshev));
-
-      // -- Evaluate Chebyshev polynomials at arbitrary points
-      CeedCall(CeedVectorGetArrayRead(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
-      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
-      CeedCall(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &v_array));
-      switch (eval_mode) {
-        case CEED_EVAL_INTERP: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
-
-          // ---- Values at point
-          for (CeedInt p = 0; p < total_num_points; p++) {
-            CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
-
-            for (CeedInt d = 0; d < dim; d++) {
-              // ------ Tensor contract with current Chebyshev polynomial values
-              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
-              CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
-                                               d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
-              pre /= Q_1d;
-              post *= 1;
-            }
-            for (CeedInt c = 0; c < num_comp; c++) v_array[c * total_num_points + p] = tmp[dim % 2][c];
-          }
-          break;
-        }
-        case CEED_EVAL_GRAD: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
-
-          // ---- Values at point
-          for (CeedInt p = 0; p < total_num_points; p++) {
-            // Dim**2 contractions, apply grad when pass == dim
-            for (CeedInt pass = 0; pass < dim; pass++) {
-              CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
-
-              for (CeedInt d = 0; d < dim; d++) {
-                // ------ Tensor contract with current Chebyshev polynomial values
-                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
-                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
-                CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
-                                                 d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
-                pre /= Q_1d;
-                post *= 1;
-              }
-              for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * total_num_points + p] = tmp[dim % 2][c];
-            }
-          }
-          break;
-        }
-        default:
-          // Nothing to do, excluded above
-          break;
-      }
-      CeedCall(CeedVectorRestoreArrayRead(basis->vec_chebyshev, &chebyshev_coeffs));
-      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
-      CeedCall(CeedVectorRestoreArray(v, &v_array));
-      break;
-    }
-    case CEED_TRANSPOSE: {
-      // Note: No switch on e_mode here because only CEED_EVAL_INTERP is supported at this time
-      // Arbitrary points to nodes
-      CeedScalar       *chebyshev_coeffs;
-      const CeedScalar *u_array, *x_array_read;
-
-      // -- Transpose of evaluation of Chebyshev polynomials at arbitrary points
-      CeedCall(CeedVectorGetArrayWrite(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
-      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
-      CeedCall(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array));
-
-      switch (eval_mode) {
-        case CEED_EVAL_INTERP: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
-
-          // ---- Values at point
-          for (CeedInt p = 0; p < total_num_points; p++) {
-            CeedInt pre = num_comp * 1, post = 1;
-
-            for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * total_num_points + p];
-            for (CeedInt d = 0; d < dim; d++) {
-              // ------ Tensor contract with current Chebyshev polynomial values
-              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
-              CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, p > 0 && d == (dim - 1), tmp[d % 2],
-                                               d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
-              pre /= 1;
-              post *= Q_1d;
-            }
-          }
-          break;
-        }
-        case CEED_EVAL_GRAD: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
-
-          // ---- Values at point
-          for (CeedInt p = 0; p < total_num_points; p++) {
-            // Dim**2 contractions, apply grad when pass == dim
-            for (CeedInt pass = 0; pass < dim; pass++) {
-              CeedInt pre = num_comp * 1, post = 1;
-
-              for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * total_num_points + p];
-              for (CeedInt d = 0; d < dim; d++) {
-                // ------ Tensor contract with current Chebyshev polynomial values
-                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
-                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
-                CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode,
-                                                 (p > 0 || (p == 0 && pass > 0)) && d == (dim - 1), tmp[d % 2],
-                                                 d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
-                pre /= 1;
-                post *= Q_1d;
-              }
-            }
-          }
-          break;
-        }
-        default:
-          // Nothing to do, excluded above
-          break;
-      }
-      CeedCall(CeedVectorRestoreArray(basis->vec_chebyshev, &chebyshev_coeffs));
-      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
-      CeedCall(CeedVectorRestoreArrayRead(u, &u_array));
-
-      // -- Interpolate transpose from Chebyshev coefficients
-      if (apply_add) CeedCall(CeedBasisApplyAdd(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
-      else CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
-      break;
-    }
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Apply basis evaluation from nodes to arbitrary points
 

From e93651e5c446557cdab5b7b1fccac084c7b91909 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 11 Sep 2024 12:40:00 -0600
Subject: [PATCH 162/571] gpu - reduce memory usage in gen backends

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 105 +++++++++++++++---
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 104 ++++++++++++++---
 2 files changed, 180 insertions(+), 29 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index eb8d5ad848..0b45db90c8 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -202,8 +202,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 // Restriction
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
-                                                       CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
-                                                       bool use_3d_slices) {
+                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
+                                                       CeedInt Q_1d, bool is_input, bool use_3d_slices) {
   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string               P_name     = "P_1d" + var_suffix;
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
@@ -229,10 +229,21 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
   // Restriction
   if (is_input) {
     // Input
-    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
+    if (field_input_buffer[i] != i) {
+      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
+
+      // Restriction was already done for previous input
+      code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
+    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
       bool is_strided;
 
-      code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+      if (eval_mode == CEED_EVAL_NONE) {
+        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
+        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+      } else {
+        // Otherwise we're using the scratch space
+        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+      }
       CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
       if (!is_strided) {
         CeedInt comp_stride;
@@ -356,7 +367,6 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
       }
       // LCOV_EXCL_START
       case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
                 // LCOV_EXCL_STOP
@@ -367,12 +377,12 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
         break;  // No action
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
              << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         break;
       case CEED_EVAL_GRAD:
-        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (use_3d_slices) {
           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
                << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
@@ -386,7 +396,6 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
       case CEED_EVAL_WEIGHT:
         break;  // Should not occur
       case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
                 // LCOV_EXCL_STOP
@@ -433,7 +442,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   // We treat quadrature points per slice in 3d to save registers
   if (use_3d_slices) {
     code << "\n    // Note: Using planes of 3D elements\n";
-    code << "#pragma unroll\n";
+    code << "    #pragma unroll\n";
     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -789,17 +798,83 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "  __syncthreads();\n";
   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
 
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 0;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedInt             num_comp, elem_size;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedInt             num_comp, elem_size;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+  }
+  code << "    // Scratch restriction buffer space\n";
+  code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+      }
+    }
+  }
+
   // -- Input restriction and basis
-  code << "    // -- Input field restrictions and basis actions\n";
+  code << "\n    // -- Input field restrictions and basis actions\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "    // ---- Input field " << i << "\n";
+    CeedInt f = input_field_order[i];
+
+    code << "    // ---- Input field " << f << "\n";
 
     // ---- Restriction
-    CeedCallBackend(
-        CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                Q_1d, true, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
   }
 
   // -- Q function
@@ -816,7 +891,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
     // ---- Restriction
     CeedCallBackend(
-        CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+        CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
   }
 
   // Close loop and function
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 6926e6fb4e..c2e21a5468 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -229,8 +229,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 // Restriction
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
-                                                      CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
-                                                      bool use_3d_slices) {
+                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
+                                                      CeedInt Q_1d, bool is_input, bool use_3d_slices) {
   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string              P_name     = "P_1d" + var_suffix;
   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
@@ -256,10 +256,22 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
   // Restriction
   if (is_input) {
     // Input
-    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
+    // Input
+    if (field_input_buffer[i] != i) {
+      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
+
+      // Restriction was already done for previous input
+      code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
+    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
       bool is_strided;
 
-      code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+      if (eval_mode == CEED_EVAL_NONE) {
+        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
+        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+      } else {
+        // Otherwise we're using the scratch space
+        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+      }
       CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
       if (!is_strided) {
         CeedInt comp_stride;
@@ -383,7 +395,6 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
       }
       // LCOV_EXCL_START
       case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
                 // LCOV_EXCL_STOP
@@ -394,12 +405,12 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
         break;  // No action
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
              << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         break;
       case CEED_EVAL_GRAD:
-        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (use_3d_slices) {
           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
                << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
@@ -413,7 +424,6 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
       case CEED_EVAL_WEIGHT:
         break;  // Should not occur
       case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
                 // LCOV_EXCL_STOP
@@ -460,7 +470,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   // We treat quadrature points per slice in 3d to save registers
   if (use_3d_slices) {
     code << "\n    // Note: Using planes of 3D elements\n";
-    code << "#pragma unroll\n";
+    code << "    #pragma unroll\n";
     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -797,17 +807,83 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   code << "  __syncthreads();\n";
   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
 
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 0;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedInt             num_comp, elem_size;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedInt             num_comp, elem_size;
+    CeedElemRestriction elem_rstr;
+
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+  }
+  code << "    // Scratch restriction buffer space\n";
+  code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+      }
+    }
+  }
+
   // -- Input restriction and basis
   code << "    // -- Input field restrictions and basis actions\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "    // ---- Input field " << i << "\n";
+    CeedInt f = input_field_order[i];
+
+    code << "    // ---- Input field " << f << "\n";
 
     // ---- Restriction
-    CeedCallBackend(
-        CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d,
+                                                               true, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
   }
 
   // -- Q function
@@ -824,7 +900,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
     // ---- Restriction
     CeedCallBackend(
-        CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+        CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
   }
 
   // Close loop and function

From 22ab0487938d6416bda03d32bba2b2245fabcc02 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 16 Sep 2024 16:52:06 -0600
Subject: [PATCH 163/571] fix(sycl): Use CeedSize value in Linear Assemble

---
 backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 2a9c59779f..8939d84a26 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -553,8 +553,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
     CeedInt  strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, l_size, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }

From 7cf95199bdce7094e6c708bc6d0230948407aea7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 17 Sep 2024 12:15:05 -0600
Subject: [PATCH 164/571] petsc - update for interface change

---
 examples/fluids/navierstokes.h        | 4 ++--
 examples/petsc/include/petscversion.h | 4 ++--
 examples/petsc/multigrid.c            | 2 +-
 examples/petsc/src/petscutils.c       | 2 +-
 examples/solids/elasticity.h          | 4 ++--
 examples/solids/src/setup-dm.c        | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index c2d57c7f32..f374e46849 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -17,8 +17,8 @@
 #include "./include/petsc_ops.h"
 #include "qfunctions/newtonian_types.h"
 
-#if PETSC_VERSION_LT(3, 21, 0)
-#error "PETSc v3.21 or later is required"
+#if PETSC_VERSION_LT(3, 22, 0)
+#error "PETSc v3.22 or later is required"
 #endif
 
 // -----------------------------------------------------------------------------
diff --git a/examples/petsc/include/petscversion.h b/examples/petsc/include/petscversion.h
index 8c1d3f92be..bbb377d3cc 100644
--- a/examples/petsc/include/petscversion.h
+++ b/examples/petsc/include/petscversion.h
@@ -9,6 +9,6 @@
 /// Petsc version check
 #pragma once
 
-#if PETSC_VERSION_LT(3, 21, 0)
-#error "PETSc v3.21 or later is required"
+#if PETSC_VERSION_LT(3, 22, 0)
+#error "PETSc v3.22 or later is required"
 #endif
diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index 346e395e6e..4fd9a62166 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -117,7 +117,7 @@ int main(int argc, char **argv) {
   if (read_mesh) {
     PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm_orig));
   } else {
-    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, &dm_orig));
+    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, 0, PETSC_FALSE, &dm_orig));
   }
 
   VecType vec_type;
diff --git a/examples/petsc/src/petscutils.c b/examples/petsc/src/petscutils.c
index e8fdc4fac3..4ec84c547b 100644
--- a/examples/petsc/src/petscutils.c
+++ b/examples/petsc/src/petscutils.c
@@ -425,7 +425,7 @@ PetscErrorCode CreateDistributedDM(RunParams rp, DM *dm) {
       }
     }
 
-    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, rp->mesh_elem, NULL, NULL, NULL, PETSC_TRUE, dm));
+    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, rp->mesh_elem, NULL, NULL, NULL, PETSC_TRUE, 0, PETSC_FALSE, dm));
   }
 
   PetscCall(DMSetFromOptions(*dm));
diff --git a/examples/solids/elasticity.h b/examples/solids/elasticity.h
index 7ac246eccc..a83789a30f 100644
--- a/examples/solids/elasticity.h
+++ b/examples/solids/elasticity.h
@@ -21,6 +21,6 @@
 #include "include/utils.h"
 #include "problems/problems.h"
 
-#if PETSC_VERSION_LT(3, 21, 0)
-#error "PETSc v3.21 or later is required"
+#if PETSC_VERSION_LT(3, 22, 0)
+#error "PETSc v3.22 or later is required"
 #endif
diff --git a/examples/solids/src/setup-dm.c b/examples/solids/src/setup-dm.c
index e70b4738dc..07c7f179fe 100644
--- a/examples/solids/src/setup-dm.c
+++ b/examples/solids/src/setup-dm.c
@@ -43,7 +43,7 @@ PetscErrorCode CreateDistributedDM(MPI_Comm comm, AppCtx app_ctx, DM *dm) {
     PetscInt dim = 3, faces[3] = {3, 3, 3};
     PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &dim, NULL));
     if (!dim) dim = 3;
-    PetscCall(DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, NULL, NULL, interpolate, dm));
+    PetscCall(DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, NULL, NULL, interpolate, 0, PETSC_FALSE, dm));
   } else {
     PetscCall(DMPlexCreateFromFile(comm, filename, NULL, interpolate, dm));
   }

From 3e961e14fcf6ad615ceece5f8606a32a7b6c0794 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 19 Sep 2024 15:28:59 -0600
Subject: [PATCH 165/571] minor - drop unneeded casts

---
 backends/cuda-ref/ceed-cuda-ref-vector.c | 12 ++++++------
 backends/hip-ref/ceed-hip-ref-vector.c   | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index d6622e0e99..9deb6dec82 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -810,18 +810,18 @@ int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", (int (*)())CeedVectorCopyStrided_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", (int (*)())CeedVectorSetValueStrided_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", CeedVectorCopyStrided_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda));
   CeedCallBackend(CeedCalloc(1, &impl));
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index f57d8bcf69..164eb822d6 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -772,18 +772,18 @@ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", (int (*)())CeedVectorCopyStrided_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", (int (*)())CeedVectorSetValueStrided_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", CeedVectorCopyStrided_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Hip));
   CeedCallBackend(CeedCalloc(1, &impl));

From 8a31047239c7e4b21930903fe948c2d08c208114 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 19 Sep 2024 16:28:58 -0600
Subject: [PATCH 166/571] fluids - fix type

---
 examples/fluids/src/misc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 5658f109f1..4510be1fa2 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -493,9 +493,10 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS
       PetscInt num_remote_roots_total = 0, num_remote_leaves_total = 0, num_ghost_interface_ranks = 0, num_owned_interface_ranks = 0;
       {
         PetscSF            sf;
-        PetscInt           nrranks, niranks;
+        PetscMPIInt        nrranks, niranks;
         const PetscInt    *roffset, *rmine, *rremote, *ioffset, *irootloc;
         const PetscMPIInt *rranks, *iranks;
+
         PetscCall(DMGetSectionSF(user->dm, &sf));
         PetscCall(PetscSFGetRootRanks(sf, &nrranks, &rranks, &roffset, &rmine, &rremote));
         PetscCall(PetscSFGetLeafRanks(sf, &niranks, &iranks, &ioffset, &irootloc));

From a82e6db6eb9ce19b312c1eb9baa0b5188d293bd9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 19 Sep 2024 16:35:34 -0600
Subject: [PATCH 167/571] fluids - fix SegBuffer size argument

---
 examples/fluids/navierstokes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index f374e46849..074ae865b5 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -235,7 +235,7 @@ struct ProblemData_private {
       apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian;
   bool          compute_exact_solution_error;
   PetscBool     set_bc_from_ics, use_strong_bc_ceed, uses_newtonian;
-  size_t        num_bc_defs;
+  PetscCount    num_bc_defs;
   BCDefinition *bc_defs;
   PetscErrorCode (*print_info)(User, ProblemData, AppCtx);
   PetscErrorCode (*create_mass_operator)(User, CeedOperator *);

From e5422a12c688862c7ac4f2ce2243f82b3939889e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 20 Sep 2024 09:37:40 -0600
Subject: [PATCH 168/571] ci - use non int64 due to PETSc leak

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 518ac31129..6febe47e49 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -96,8 +96,8 @@ noether-cpu:
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit realsearch=%
 # Libraries for examples
-# -- PETSc with HIP (minimal)
-    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
+# -- PETSc (minimal)
+    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cpu && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids"
 # -- MFEM v4.7

From 3472baabeb0c74f35a20649b8ce5fff487e62103 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 20 Sep 2024 13:15:06 -0600
Subject: [PATCH 169/571] ci - restore int64 testing

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6febe47e49..7a9e8e44cc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -97,7 +97,7 @@ noether-cpu:
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit realsearch=%
 # Libraries for examples
 # -- PETSc (minimal)
-    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cpu && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
+    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cpu-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids"
 # -- MFEM v4.7

From 9937a20e594bfb4e867db08aa6f5f4a7e0ba1b8e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 16 Sep 2024 10:51:26 -0600
Subject: [PATCH 170/571] memcheck - clarify vector access

---
 backends/memcheck/ceed-memcheck-vector.c | 158 ++++++++++++++++-------
 backends/memcheck/ceed-memcheck.h        |  14 +-
 2 files changed, 121 insertions(+), 51 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index 0435fe7e5a..48972948f8 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -21,7 +21,7 @@ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_arra
   CeedVector_Memcheck *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
-  *has_valid_array = impl->array;
+  *has_valid_array = !!impl->array_allocated;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -31,9 +31,10 @@ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_arra
 static inline int CeedVectorHasBorrowedArrayOfType_Memcheck(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
   CeedVector_Memcheck *impl;
 
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-  *has_borrowed_array_of_type = impl->array_borrowed;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  *has_borrowed_array_of_type = !!impl->array_borrowed;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -44,45 +45,74 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-
+  // Clear previous owned arrays
   if (impl->array_allocated) {
     for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
   }
   CeedCallBackend(CeedFree(&impl->array_allocated));
+  VALGRIND_DISCARD(impl->allocated_block_id);
   if (impl->array_owned) {
     for (CeedSize i = 0; i < length; i++) impl->array_owned[i] = NAN;
   }
+  VALGRIND_DISCARD(impl->owned_block_id);
   CeedCallBackend(CeedFree(&impl->array_owned));
+
+  // Clear borrowed block id, if present
+  if (impl->array_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // Set internal pointers to external arrays
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      CeedCallBackend(CeedCalloc(length, &impl->array_owned));
+      impl->array_owned    = NULL;
       impl->array_borrowed = NULL;
-      impl->array          = impl->array_owned;
-      if (array) {
-        memcpy(impl->array, array, length * sizeof(CeedScalar));
-      } else {
-        for (CeedInt i = 0; i < length; i++) impl->array[i] = NAN;
-      }
       break;
     case CEED_OWN_POINTER:
       impl->array_owned    = array;
       impl->array_borrowed = NULL;
-      impl->array          = array;
+      impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->array_owned, length * sizeof(CeedScalar), "Owned external array buffer");
       break;
     case CEED_USE_POINTER:
-      impl->array_borrowed = array;
-      impl->array          = array;
+      impl->array_owned       = NULL;
+      impl->array_borrowed    = array;
+      impl->borrowed_block_id = VALGRIND_CREATE_BLOCK(impl->array_borrowed, length * sizeof(CeedScalar), "Borrowed external array buffer");
+      break;
   }
-  // Copy data to check access
+
+  // Create internal array data buffer
   CeedCallBackend(CeedCalloc(length, &impl->array_allocated));
-  memcpy(impl->array_allocated, impl->array, length * sizeof(CeedScalar));
-  impl->array = impl->array_allocated;
-  VALGRIND_DISCARD(impl->mem_block_id);
-  impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->array, length * sizeof(CeedScalar), "'Vector backend array data copy'");
+  impl->allocated_block_id = VALGRIND_CREATE_BLOCK(impl->array_allocated, length * sizeof(CeedScalar), "Allocated internal array buffer");
+  if (array) {
+    memcpy(impl->array_allocated, array, length * sizeof(CeedScalar));
+  } else {
+    for (CeedInt i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Sync arrays
+//------------------------------------------------------------------------------
+static int CeedVectorSyncArray_Memcheck(const CeedVector vec, CeedMemType mem_type) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  // Copy internal buffer back to owned or borrowed array
+  if (impl->array_owned) {
+    memcpy(impl->array_owned, impl->array_allocated, length * sizeof(CeedScalar));
+  }
+  if (impl->array_borrowed) {
+    memcpy(impl->array_borrowed, impl->array_allocated, length * sizeof(CeedScalar));
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -93,19 +123,25 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+  // Synchronize memory
+  CeedCallBackend(CeedVectorSyncArray_Memcheck(vec, CEED_MEM_HOST));
 
+  // Return borrowed array
   (*array)             = impl->array_borrowed;
   impl->array_borrowed = NULL;
-  impl->array          = NULL;
-  VALGRIND_DISCARD(impl->mem_block_id);
+  VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // De-allocate internal memory
   if (impl->array_allocated) {
     for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
   }
   CeedCallBackend(CeedFree(&impl->array_allocated));
+  VALGRIND_DISCARD(impl->allocated_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -116,13 +152,15 @@ static int CeedVectorGetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
-
+  // Create and return writable buffer
   CeedCallBackend(CeedCalloc(length, &impl->array_writable_copy));
-  memcpy(impl->array_writable_copy, impl->array, length * sizeof(CeedScalar));
+  impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->array_writable_copy, length * sizeof(CeedScalar), "Allocated writeable array buffer copy");
+  memcpy(impl->array_writable_copy, impl->array_allocated, length * sizeof(CeedScalar));
   *array = impl->array_writable_copy;
   return CEED_ERROR_SUCCESS;
 }
@@ -134,15 +172,16 @@ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type,
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
-
-  // Make copy to verify no write occurred
+  // Create and return read-only buffer
   if (!impl->array_read_only_copy) {
     CeedCallBackend(CeedCalloc(length, &impl->array_read_only_copy));
-    memcpy(impl->array_read_only_copy, impl->array, length * sizeof(CeedScalar));
+    impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->array_read_only_copy, length * sizeof(CeedScalar), "Allocated read-only array buffer copy");
+    memcpy(impl->array_read_only_copy, impl->array_allocated, length * sizeof(CeedScalar));
   }
   *array = impl->array_read_only_copy;
   return CEED_ERROR_SUCCESS;
@@ -155,12 +194,18 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  // Invalidate data to make sure no read occurs
-  if (!impl->array) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, mem_type, CEED_COPY_VALUES, NULL));
+  // Allocate buffer if necessary
+  if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, mem_type, CEED_COPY_VALUES, NULL));
+
+  // Get writable buffer
   CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, array));
+
+  // Invalidate array data to prevent accidental reads
   for (CeedSize i = 0; i < length; i++) (*array)[i] = NAN;
   impl->is_write_only_access = true;
   return CEED_ERROR_SUCCESS;
@@ -174,26 +219,27 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
 
-  memcpy(impl->array, impl->array_writable_copy, length * sizeof(CeedScalar));
-  for (CeedSize i = 0; i < length; i++) impl->array_writable_copy[i] = NAN;
-  CeedCallBackend(CeedFree(&impl->array_writable_copy));
+  // Check for unset entries after write-only access
   if (impl->is_write_only_access) {
     for (CeedSize i = 0; i < length; i++) {
-      if (isnan(impl->array[i]))
+      if (isnan(impl->array_writable_copy[i]))
         CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i);
     }
     impl->is_write_only_access = false;
   }
-  if (impl->array_borrowed) {
-    memcpy(impl->array_borrowed, impl->array, length * sizeof(CeedScalar));
-  }
-  if (impl->array_owned) {
-    memcpy(impl->array_owned, impl->array, length * sizeof(CeedScalar));
-  }
+
+  // Copy back to internal buffer and sync
+  memcpy(impl->array_allocated, impl->array_writable_copy, length * sizeof(CeedScalar));
+  CeedCallBackend(CeedVectorSyncArray_Memcheck(vec, CEED_MEM_HOST));
+
+  // Invalidate writable buffer
+  for (CeedSize i = 0; i < length; i++) impl->array_writable_copy[i] = NAN;
+  CeedCallBackend(CeedFree(&impl->array_writable_copy));
+  VALGRIND_DISCARD(impl->writable_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -201,17 +247,23 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
 // Vector Restore Array Read-Only
 //------------------------------------------------------------------------------
 static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) {
+  Ceed                 ceed;
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(!memcmp(impl->array, impl->array_read_only_copy, length * sizeof(CeedScalar)), CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
-            "Array data changed while accessed in read-only mode");
+  // Verify no changes made during read-only access
+  bool is_changed = memcmp(impl->array_allocated, impl->array_read_only_copy, length * sizeof(CeedScalar));
+
+  CeedCheck(!is_changed, ceed, CEED_ERROR_BACKEND, "Array data changed while accessed in read-only mode");
 
+  // Invalidate read-only buffer
   for (CeedSize i = 0; i < length; i++) impl->array_read_only_copy[i] = NAN;
   CeedCallBackend(CeedFree(&impl->array_read_only_copy));
+  VALGRIND_DISCARD(impl->read_only_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -221,10 +273,19 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) {
 static int CeedVectorDestroy_Memcheck(CeedVector vec) {
   CeedVector_Memcheck *impl;
 
+  // Free allocations and discard block ids
   CeedCallBackend(CeedVectorGetData(vec, &impl));
-  VALGRIND_DISCARD(impl->mem_block_id);
-  CeedCallBackend(CeedFree(&impl->array_allocated));
-  CeedCallBackend(CeedFree(&impl->array_owned));
+  if (impl->array_allocated) {
+    CeedCallBackend(CeedFree(&impl->array_allocated));
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
+  if (impl->array_owned) {
+    CeedCallBackend(CeedFree(&impl->array_owned));
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  if (impl->array_borrowed) {
+    VALGRIND_DISCARD(impl->borrowed_block_id);
+  }
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -243,6 +304,7 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Memcheck));
diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h
index f4787cf1ce..8813c62106 100644
--- a/backends/memcheck/ceed-memcheck.h
+++ b/backends/memcheck/ceed-memcheck.h
@@ -10,13 +10,21 @@
 #include <ceed/backend.h>
 
 typedef struct {
-  int         mem_block_id;
-  bool        is_write_only_access;
-  CeedScalar *array;
+  // Internal array buffer
+  int         allocated_block_id;
   CeedScalar *array_allocated;
+  // Owned external array
+  int         owned_block_id;
   CeedScalar *array_owned;
+  // Borrowed external array
+  int         borrowed_block_id;
   CeedScalar *array_borrowed;
+  // Externally viewable read-only array
+  int         read_only_block_id;
   CeedScalar *array_read_only_copy;
+  // Externally viewable writable array
+  bool        is_write_only_access;
+  int         writable_block_id;
   CeedScalar *array_writable_copy;
 } CeedVector_Memcheck;
 

From 3345ba21ea3223800ed4bc1b9a4502bd60b77d9f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 16 Sep 2024 11:21:24 -0600
Subject: [PATCH 171/571] memcheck - minor clarify on QF impl

---
 backends/memcheck/ceed-memcheck-qfunction.c | 24 +++++++++++++++------
 backends/memcheck/ceed-memcheck.h           |  2 +-
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c
index 7c66e3601a..79575b3ad2 100644
--- a/backends/memcheck/ceed-memcheck-qfunction.c
+++ b/backends/memcheck/ceed-memcheck-qfunction.c
@@ -19,6 +19,7 @@
 static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) {
   Ceed                    ceed;
   void                   *ctx_data = NULL;
+  int                     input_block_ids[CEED_FIELD_MAX], output_block_ids[CEED_FIELD_MAX];
   CeedInt                 num_in, num_out;
   CeedQFunctionUser       f = NULL;
   CeedQFunctionField     *output_fields;
@@ -29,12 +30,21 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
   CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data));
   CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f));
   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_in, &num_out));
-  int mem_block_ids[num_out];
 
-  // Get input/output arrays
+  // Get input arrays
   for (CeedInt i = 0; i < num_in; i++) {
+    CeedSize len;
+    char     name[32] = "";
+
     CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i]));
+
+    CeedCallBackend(CeedVectorGetLength(U[i], &len));
+
+    snprintf(name, 32, "QFunction input %" CeedInt_FMT, i);
+    input_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->inputs[i], len, name);
   }
+
+  // Get output arrays
   for (CeedInt i = 0; i < num_out; i++) {
     CeedSize len;
     char     name[32] = "";
@@ -44,8 +54,8 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
     CeedCallBackend(CeedVectorGetLength(V[i], &len));
     VALGRIND_MAKE_MEM_UNDEFINED(impl->outputs[i], len);
 
-    snprintf(name, 32, "'QFunction output %" CeedInt_FMT "'", i);
-    mem_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->outputs[i], len, name);
+    snprintf(name, 32, "QFunction output %" CeedInt_FMT, i);
+    output_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->outputs[i], len, name);
   }
 
   // Call user function
@@ -54,8 +64,10 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
   // Restore input arrays
   for (CeedInt i = 0; i < num_in; i++) {
     CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &impl->inputs[i]));
+    VALGRIND_DISCARD(input_block_ids[i]);
   }
-  // Check for unset output values
+
+  // Check for unset output values and restore arrays
   {
     const char *kernel_name, *kernel_path;
 
@@ -73,7 +85,7 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
                   kernel_name);
       }
       CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->outputs[i]));
-      VALGRIND_DISCARD(mem_block_ids[i]);
+      VALGRIND_DISCARD(output_block_ids[i]);
     }
   }
   CeedCallBackend(CeedQFunctionRestoreContextData(qf, &ctx_data));
diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h
index 8813c62106..13e68dd035 100644
--- a/backends/memcheck/ceed-memcheck.h
+++ b/backends/memcheck/ceed-memcheck.h
@@ -40,9 +40,9 @@ typedef struct {
 } CeedElemRestriction_Memcheck;
 
 typedef struct {
+  bool               setup_done;
   const CeedScalar **inputs;
   CeedScalar       **outputs;
-  bool               setup_done;
 } CeedQFunction_Memcheck;
 
 typedef struct {

From 0307dd026e7027ceb655d92ffb4302ee80e86e93 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 16 Sep 2024 11:47:25 -0600
Subject: [PATCH 172/571] memcheck - also tidy up the ctx impl

---
 .../memcheck/ceed-memcheck-qfunctioncontext.c | 175 ++++++++++++++----
 backends/memcheck/ceed-memcheck-vector.c      |   6 +-
 backends/memcheck/ceed-memcheck.h             |  13 +-
 3 files changed, 150 insertions(+), 44 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-qfunctioncontext.c b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
index 4da0d0ee68..57afe981af 100644
--- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c
+++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
@@ -20,7 +20,7 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, b
   CeedQFunctionContext_Memcheck *impl;
 
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  *has_valid_data = impl->data;
+  *has_valid_data = !!impl->data_allocated;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -30,9 +30,10 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, b
 static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) {
   CeedQFunctionContext_Memcheck *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-  *has_borrowed_data_of_type = impl->data_borrowed;
+
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  *has_borrowed_data_of_type = !!impl->data_borrowed;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -43,35 +44,69 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe
   size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-
+  // Clear previous owned data buffers
+  if (impl->data_allocated) {
+    memset(impl->data_allocated, -42, ctx_size);
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->data_allocated));
+  if (impl->data_owned) {
+    memset(impl->data_owned, -42, ctx_size);
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->data_owned));
+
+  // Clear borrowed block id, if present
+  if (impl->data_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // Set internal pointers to external buffers
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_owned));
+      impl->data_owned    = NULL;
       impl->data_borrowed = NULL;
-      impl->data          = impl->data_owned;
-      memcpy(impl->data, data, ctx_size);
       break;
     case CEED_OWN_POINTER:
-      impl->data_owned    = data;
-      impl->data_borrowed = NULL;
-      impl->data          = data;
+      impl->data_owned     = data;
+      impl->data_borrowed  = NULL;
+      impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->data_owned, ctx_size, "Owned external data buffer");
       break;
     case CEED_USE_POINTER:
-      impl->data_borrowed = data;
-      impl->data          = data;
+      impl->data_owned     = NULL;
+      impl->data_borrowed  = data;
+      impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->data_borrowed, ctx_size, "Borrowed external data buffer");
   }
-  // Copy data to check ctx_size bounds
+
+  // Create internal data buffer
   CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_allocated));
-  memcpy(impl->data_allocated, impl->data, ctx_size);
-  impl->data = impl->data_allocated;
-  VALGRIND_DISCARD(impl->mem_block_id);
-  impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->data, ctx_size, "'QFunction backend context data copy'");
+  impl->allocated_block_id = VALGRIND_CREATE_BLOCK(impl->data_allocated, ctx_size, "'Allocated internal context data buffer");
+  memcpy(impl->data_allocated, data, ctx_size);
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Sync data
+//------------------------------------------------------------------------------
+static int CeedQFunctionContextSyncData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type) {
+  size_t                         ctx_size;
+  CeedQFunctionContext_Memcheck *impl;
+
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
+
+  // Copy internal buffer back to owned or borrowed data buffer
+  if (impl->data_owned) {
+    memcpy(impl->data_owned, impl->data_allocated, ctx_size);
+  }
+  if (impl->data_borrowed) {
+    memcpy(impl->data_borrowed, impl->data_allocated, ctx_size);
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -79,16 +114,27 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe
 // QFunctionContext Take Data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
+  size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+  // Synchronize memory
+  CeedCallBackend(CeedQFunctionContextSyncData_Memcheck(ctx, CEED_MEM_HOST));
 
+  // Return borrowed buffer
   *(void **)data      = impl->data_borrowed;
   impl->data_borrowed = NULL;
-  impl->data          = NULL;
-  VALGRIND_DISCARD(impl->mem_block_id);
+  VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // De-allocate internal memory
+  if (impl->data_allocated) {
+    memset(impl->data_allocated, -42, ctx_size);
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->data_allocated));
   return CEED_ERROR_SUCCESS;
 }
@@ -97,13 +143,19 @@ static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedM
 // QFunctionContext Get Data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
+  size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
 
-  *(void **)data = impl->data;
+  // Create and return writable buffer
+  CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_writable_copy));
+  impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->data_writable_copy, ctx_size, "Allocated writeable data buffer copy");
+  memcpy(impl->data_writable_copy, impl->data_allocated, ctx_size);
+  *(void **)data = impl->data_writable_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -114,13 +166,18 @@ static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, Ce
   size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
-  CeedCallBackend(CeedQFunctionContextGetData_Memcheck(ctx, mem_type, data));
 
-  // Make copy to verify no write occurred
-  CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy));
-  memcpy(impl->data_read_only_copy, *(void **)data, ctx_size);
+  // Create and return read-only buffer
+  if (!impl->data_read_only_copy) {
+    CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy));
+    impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->data_read_only_copy, ctx_size, "Allocated read-only data buffer copy");
+    memcpy(impl->data_read_only_copy, impl->data_allocated, ctx_size);
+  }
+  *(void **)data = impl->data_read_only_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -134,8 +191,14 @@ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
-  if (impl->data_borrowed) memcpy(impl->data_borrowed, impl->data, ctx_size);
-  if (impl->data_owned) memcpy(impl->data_owned, impl->data, ctx_size);
+  // Copy back to internal buffer and sync
+  memcpy(impl->data_allocated, impl->data_writable_copy, ctx_size);
+  CeedCallBackend(CeedQFunctionContextSyncData_Memcheck(ctx, CEED_MEM_HOST));
+
+  // Invalidate writable buffer
+  memset(impl->data_writable_copy, -42, ctx_size);
+  CeedCallBackend(CeedFree(&impl->data_writable_copy));
+  VALGRIND_DISCARD(impl->writable_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -143,16 +206,23 @@ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) {
 // QFunctionContext Restore Data Read-Only
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx) {
+  Ceed                           ceed;
   size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
+  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
-  CeedCheck(!memcmp(impl->data, impl->data_read_only_copy, ctx_size), CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND,
-            "Context data changed while accessed in read-only mode");
+  // Verify no changes made during read-only access
+  bool is_changed = memcmp(impl->data_allocated, impl->data_read_only_copy, ctx_size);
+
+  CeedCheck(!is_changed, ceed, CEED_ERROR_BACKEND, "Context data changed while accessed in read-only mode");
 
+  // Invalidate read-only buffer
+  memset(impl->data_read_only_copy, -42, ctx_size);
   CeedCallBackend(CeedFree(&impl->data_read_only_copy));
+  VALGRIND_DISCARD(impl->read_only_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -160,20 +230,37 @@ static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx
 // QFunctionContext destroy user data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) {
+  Ceed                                ceed;
   CeedMemType                         data_destroy_mem_type;
   CeedQFunctionContextDataDestroyUser data_destroy_function;
   CeedQFunctionContext_Memcheck      *impl;
 
+  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function));
 
-  CeedCheck(data_destroy_mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND,
-            "Can only destroy HOST memory for this backend");
+  CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function));
+  CeedCheck(data_destroy_mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only destroy HOST memory for this backend");
 
+  // Run user destroy routine
   if (data_destroy_function) {
-    CeedCallBackend(data_destroy_function(impl->data_borrowed ? impl->data_borrowed : impl->data_owned));
+    bool is_borrowed = !!impl->data_borrowed;
+
+    CeedCallBackend(data_destroy_function(is_borrowed ? impl->data_borrowed : impl->data_owned));
+    if (is_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id);
+    else VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  // Free allocations and discard block ids
+  if (impl->data_allocated) {
+    CeedCallBackend(CeedFree(&impl->data_allocated));
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
+  if (impl->data_owned) {
+    CeedCallBackend(CeedFree(&impl->data_owned));
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  if (impl->data_borrowed) {
+    VALGRIND_DISCARD(impl->borrowed_block_id);
   }
-  CeedCallBackend(CeedFree(&impl->data_allocated));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -183,9 +270,19 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) {
 static int CeedQFunctionContextDestroy_Memcheck(CeedQFunctionContext ctx) {
   CeedQFunctionContext_Memcheck *impl;
 
+  // Free allocations and discard block ids
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  CeedCallBackend(CeedFree(&impl->data_allocated));
-  CeedCallBackend(CeedFree(&impl->data_owned));
+  if (impl->data_allocated) {
+    CeedCallBackend(CeedFree(&impl->data_allocated));
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
+  if (impl->data_owned) {
+    CeedCallBackend(CeedFree(&impl->data_owned));
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  if (impl->data_borrowed) {
+    VALGRIND_DISCARD(impl->borrowed_block_id);
+  }
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index 48972948f8..23979c6f77 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -53,13 +53,13 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   // Clear previous owned arrays
   if (impl->array_allocated) {
     for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+    VALGRIND_DISCARD(impl->allocated_block_id);
   }
   CeedCallBackend(CeedFree(&impl->array_allocated));
-  VALGRIND_DISCARD(impl->allocated_block_id);
   if (impl->array_owned) {
     for (CeedSize i = 0; i < length; i++) impl->array_owned[i] = NAN;
+    VALGRIND_DISCARD(impl->owned_block_id);
   }
-  VALGRIND_DISCARD(impl->owned_block_id);
   CeedCallBackend(CeedFree(&impl->array_owned));
 
   // Clear borrowed block id, if present
@@ -139,9 +139,9 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce
   // De-allocate internal memory
   if (impl->array_allocated) {
     for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+    VALGRIND_DISCARD(impl->allocated_block_id);
   }
   CeedCallBackend(CeedFree(&impl->array_allocated));
-  VALGRIND_DISCARD(impl->allocated_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h
index 13e68dd035..45eb9c5ae3 100644
--- a/backends/memcheck/ceed-memcheck.h
+++ b/backends/memcheck/ceed-memcheck.h
@@ -46,12 +46,21 @@ typedef struct {
 } CeedQFunction_Memcheck;
 
 typedef struct {
-  int   mem_block_id;
-  void *data;
+  // Internal data buffer
+  int   allocated_block_id;
   void *data_allocated;
+  // Owned external data
+  int   owned_block_id;
   void *data_owned;
+  // Borrowed external data
+  int   borrowed_block_id;
   void *data_borrowed;
+  // Externally viewable read-only data
+  int   read_only_block_id;
   void *data_read_only_copy;
+  // Externally viewable writable data
+  int   writable_block_id;
+  void *data_writable_copy;
 } CeedQFunctionContext_Memcheck;
 
 CEED_INTERN int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec);

From 77d04a1cd52f4bd525fc353da4f86cac2faab0fe Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 19 Sep 2024 15:21:46 -0600
Subject: [PATCH 173/571] memcheck - emulate device vector methods too

---
 backends/memcheck/ceed-memcheck-vector.c | 118 +++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index 23979c6f77..ca86978165 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -7,6 +7,7 @@
 
 #include <ceed.h>
 #include <ceed/backend.h>
+#include <assert.h>
 #include <math.h>
 #include <stdbool.h>
 #include <string.h>
@@ -94,6 +95,38 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Set internal array to value
+//------------------------------------------------------------------------------
+static int CeedVectorSetValue_Memcheck(CeedVector vec, CeedScalar value) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
+  assert(impl->array_allocated);
+  for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = value;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set internal array to value strided
+//------------------------------------------------------------------------------
+static int CeedVectorSetValueStrided_Memcheck(CeedVector vec, CeedSize start, CeedSize step, CeedScalar val) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
+  assert(impl->array_allocated);
+  for (CeedSize i = start; i < length; i += step) impl->array_allocated[i] = val;
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Sync arrays
 //------------------------------------------------------------------------------
@@ -267,6 +300,84 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) {
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Take reciprocal of a vector
+//------------------------------------------------------------------------------
+static int CeedVectorReciprocal_Memcheck(CeedVector vec) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  for (CeedSize i = 0; i < length; i++) {
+    if (fabs(impl->array_allocated[i]) > CEED_EPSILON) impl->array_allocated[i] = 1. / impl->array_allocated[i];
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute x = alpha x
+//------------------------------------------------------------------------------
+static int CeedVectorScale_Memcheck(CeedVector x, CeedScalar alpha) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl));
+  CeedCallBackend(CeedVectorGetLength(x, &length));
+
+  for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] *= alpha;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute y = alpha x + y
+//------------------------------------------------------------------------------
+static int CeedVectorAXPY_Memcheck(CeedVector y, CeedScalar alpha, CeedVector x) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl_x, *impl_y;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl_x));
+  CeedCallBackend(CeedVectorGetData(y, &impl_y));
+  CeedCallBackend(CeedVectorGetLength(y, &length));
+
+  for (CeedSize i = 0; i < length; i++) impl_y->array_allocated[i] += alpha * impl_x->array_allocated[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute y = alpha x + beta y
+//------------------------------------------------------------------------------
+static int CeedVectorAXPBY_Memcheck(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl_x, *impl_y;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl_x));
+  CeedCallBackend(CeedVectorGetData(y, &impl_y));
+  CeedCallBackend(CeedVectorGetLength(y, &length));
+
+  for (CeedSize i = 0; i < length; i++) impl_y->array_allocated[i] = alpha * impl_x->array_allocated[i] + beta * impl_y->array_allocated[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute the pointwise multiplication w = x .* y
+//------------------------------------------------------------------------------
+static int CeedVectorPointwiseMult_Memcheck(CeedVector w, CeedVector x, CeedVector y) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl_x, *impl_y, *impl_w;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl_x));
+  CeedCallBackend(CeedVectorGetData(y, &impl_y));
+  CeedCallBackend(CeedVectorGetData(w, &impl_w));
+  CeedCallBackend(CeedVectorGetLength(w, &length));
+
+  if (!impl_w->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(w, CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
+  assert(impl_w->array_allocated);
+  for (CeedSize i = 0; i < length; i++) impl_w->array_allocated[i] = impl_x->array_allocated[i] * impl_y->array_allocated[i];
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Vector Destroy
 //------------------------------------------------------------------------------
@@ -304,6 +415,8 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Memcheck));
@@ -311,6 +424,11 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Memcheck));
   return CEED_ERROR_SUCCESS;
 }

From 73501bfef0b51f07a8a68767e825f66e6042112c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 23 Sep 2024 10:46:53 -0600
Subject: [PATCH 174/571] ceed - add Ceed[Get,Restore]WorkVector

---
 include/ceed-impl.h    |  23 ++++++---
 include/ceed/backend.h |   2 +
 interface/ceed.c       | 113 +++++++++++++++++++++++++++++++++++++++++
 tests/t130-vector.c    |  44 ++++++++++++++++
 4 files changed, 175 insertions(+), 7 deletions(-)
 create mode 100644 tests/t130-vector.c

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 0b76071a08..902dbbe35c 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -82,6 +82,14 @@ typedef struct {
   Ceed  delegate;
 } ObjDelegate;
 
+// Work vector tracking
+typedef struct CeedWorkVectors_private *CeedWorkVectors;
+struct CeedWorkVectors_private {
+  CeedInt     num_vecs, max_vecs;
+  bool       *is_in_use;
+  CeedVector *vecs;
+};
+
 struct Ceed_private {
   const char  *resource;
   Ceed         delegate;
@@ -113,13 +121,14 @@ struct Ceed_private {
   int (*OperatorCreate)(CeedOperator);
   int (*OperatorCreateAtPoints)(CeedOperator);
   int (*CompositeOperatorCreate)(CeedOperator);
-  int      ref_count;
-  void    *data;
-  bool     is_debug;
-  bool     has_valid_op_fallback_resource;
-  bool     is_deterministic;
-  char     err_msg[CEED_MAX_RESOURCE_LEN];
-  FOffset *f_offsets;
+  int             ref_count;
+  void           *data;
+  bool            is_debug;
+  bool            has_valid_op_fallback_resource;
+  bool            is_deterministic;
+  char            err_msg[CEED_MAX_RESOURCE_LEN];
+  FOffset        *f_offsets;
+  CeedWorkVectors work_vectors;
 };
 
 struct CeedVector_private {
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index d6c01735f2..05da6f8981 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -252,6 +252,8 @@ CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *ob
 CEED_EXTERN int CeedGetData(Ceed ceed, void *data);
 CEED_EXTERN int CeedSetData(Ceed ceed, void *data);
 CEED_EXTERN int CeedReference(Ceed ceed);
+CEED_EXTERN int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec);
+CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec);
 
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
 CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
diff --git a/interface/ceed.c b/interface/ceed.c
index b15cba3a51..1becb3de14 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -138,6 +138,43 @@ int CeedRegisterImpl(const char *prefix, int (*init)(const char *, Ceed), unsign
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Create a work vector space for a `ceed`
+
+  @param[in,out] ceed `Ceed` to create work vector space for
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedWorkVectorsCreate(Ceed ceed) {
+  CeedCall(CeedCalloc(1, &ceed->work_vectors));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a work vector space for a `ceed`
+
+  @param[in,out] ceed `Ceed` to destroy work vector space for
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedWorkVectorsDestroy(Ceed ceed) {
+  if (!ceed->work_vectors) return CEED_ERROR_SUCCESS;
+  for (CeedSize i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    CeedCheck(!ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " checked out but not returned");
+    ceed->ref_count += 2;  // Note: increase ref_count to prevent Ceed destructor from triggering again
+    CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i]));
+    ceed->ref_count -= 1;  // Note: restore ref_count
+  }
+  CeedCall(CeedFree(&ceed->work_vectors->is_in_use));
+  CeedCall(CeedFree(&ceed->work_vectors->vecs));
+  CeedCall(CeedFree(&ceed->work_vectors));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -751,6 +788,81 @@ int CeedReference(Ceed ceed) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get a `CeedVector` for scratch work from a `Ceed` context.
+
+  Note: This vector must be restored with @ref CeedRestoreWorkVector().
+
+  @param[in]  ceed `Ceed` context
+  @param[in]  len  Minimum length of work vector
+  @param[out] vec  Address of the variable where `CeedVector` will be stored
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
+  CeedInt i = 0;
+
+  if (!ceed->work_vectors) CeedCall(CeedWorkVectorsCreate(ceed));
+
+  // Search for big enough work vector
+  for (i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    if (!ceed->work_vectors->is_in_use[i]) {
+      CeedSize work_len;
+
+      CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &work_len));
+      if (work_len >= len) break;
+    }
+  }
+  // Long enough vector was not found
+  if (i == ceed->work_vectors->num_vecs) {
+    if (ceed->work_vectors->max_vecs == 0) {
+      ceed->work_vectors->max_vecs = 1;
+      CeedCall(CeedCalloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->vecs));
+      CeedCall(CeedCalloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->is_in_use));
+    } else if (ceed->work_vectors->max_vecs == i) {
+      ceed->work_vectors->max_vecs *= 2;
+      CeedCall(CeedRealloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->vecs));
+      CeedCall(CeedRealloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->is_in_use));
+    }
+    ceed->work_vectors->num_vecs++;
+    CeedCallBackend(CeedVectorCreate(ceed, len, &ceed->work_vectors->vecs[i]));
+    ceed->ref_count--;  // Note: ref_count manipulation to prevent a ref-loop
+  }
+  // Return pointer to work vector
+  ceed->work_vectors->is_in_use[i] = true;
+  *vec                             = NULL;
+  CeedCall(CeedVectorReferenceCopy(ceed->work_vectors->vecs[i], vec));
+  ceed->ref_count++;  // Note: bump ref_count to account for external access
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Restore a `CeedVector` for scratch work from a `Ceed` context from @ref CeedGetWorkVector()
+
+  @param[in]  ceed `Ceed` context
+  @param[out] vec  `CeedVector` to restore
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) {
+  for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    if (*vec == ceed->work_vectors->vecs[i]) {
+      CeedCheck(ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " was not checked out but is being returned");
+      CeedCall(CeedVectorDestroy(vec));
+      ceed->work_vectors->is_in_use[i] = false;
+      ceed->ref_count--;  // Note: reduce ref_count again to prevent a ref-loop
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+  // LCOV_EXCL_START
+  return CeedError(ceed, CEED_ERROR_MAJOR, "vec was not checked out via CeedGetWorkVector()");
+  // LCOV_EXCL_STOP
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -1200,6 +1312,7 @@ int CeedDestroy(Ceed *ceed) {
   CeedCall(CeedFree(&(*ceed)->resource));
   CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));
   CeedCall(CeedFree(&(*ceed)->op_fallback_resource));
+  CeedCall(CeedWorkVectorsDestroy(*ceed));
   CeedCall(CeedFree(ceed));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/tests/t130-vector.c b/tests/t130-vector.c
new file mode 100644
index 0000000000..d223a1ad06
--- /dev/null
+++ b/tests/t130-vector.c
@@ -0,0 +1,44 @@
+/// @file
+/// Test getting and restoring work vectors
+/// \test Test getting and restoring work vectors
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  // Check for getting the same work vector back
+  {
+    CeedVector x, y;
+
+    CeedGetWorkVector(ceed, 20, &x);
+    // Do not do this!
+    CeedVector x_copy = x;
+
+    CeedRestoreWorkVector(ceed, &x);
+    CeedGetWorkVector(ceed, 20, &y);
+    if (y != x_copy) printf("failed to return same work vector");
+    CeedRestoreWorkVector(ceed, &y);
+  }
+
+  // Check for getting a new work vector back
+  {
+    CeedVector x, y;
+
+    CeedGetWorkVector(ceed, 20, &x);
+    // Do not do this!
+    CeedVector x_copy = x;
+
+    CeedRestoreWorkVector(ceed, &x);
+    CeedGetWorkVector(ceed, 30, &y);
+    if (y == x_copy) printf("failed to return new work vector");
+    CeedRestoreWorkVector(ceed, &y);
+  }
+
+  CeedDestroy(&ceed);
+  return 0;
+}

From 2bf66f3ba75e883c03ae18c8a6b7ba12e94847f2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 25 Sep 2024 16:28:38 -0600
Subject: [PATCH 175/571] memcheck - clearer QF output NaN error msg

---
 backends/memcheck/ceed-memcheck-qfunction.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c
index 79575b3ad2..1bb8158584 100644
--- a/backends/memcheck/ceed-memcheck-qfunction.c
+++ b/backends/memcheck/ceed-memcheck-qfunction.c
@@ -75,14 +75,16 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
     CeedCallBackend(CeedQFunctionGetKernelName(qf, &kernel_name));
     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &output_fields));
     for (CeedInt i = 0; i < num_out; i++) {
-      CeedInt field_size;
+      const char *field_name;
+      CeedInt     field_size;
 
       // Note: need field size because vector may be longer than needed for output
       CeedCallBackend(CeedQFunctionFieldGetSize(output_fields[i], &field_size));
+      CeedCallBackend(CeedQFunctionFieldGetName(output_fields[i], &field_name));
       for (CeedSize j = 0; j < field_size * (CeedSize)Q; j++) {
         CeedCheck(!isnan(impl->outputs[i][j]), ceed, CEED_ERROR_BACKEND,
-                  "QFunction output %" CeedInt_FMT " entry %" CeedSize_FMT " is NaN after restoring write-only access: %s:%s ", i, j, kernel_path,
-                  kernel_name);
+                  "QFunction output %" CeedInt_FMT " '%s' entry %" CeedSize_FMT " is NaN after restoring write-only access: %s:%s ", i, field_name, j,
+                  kernel_path, kernel_name);
       }
       CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->outputs[i]));
       VALGRIND_DISCARD(output_block_ids[i]);

From 6c10af5d44be86c880e303f5037addb2f5724932 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 26 Sep 2024 13:30:24 -0600
Subject: [PATCH 176/571] minor - style and lcov fixes

---
 backends/memcheck/ceed-memcheck-vector.c |  3 ++-
 examples/ceed/ex1-volume.c               |  3 ++-
 examples/ceed/ex2-surface.c              |  3 ++-
 examples/fluids/src/petsc_ops.c          | 12 ++++++++----
 examples/fluids/src/setuplibceed.c       |  3 ++-
 examples/petsc/bpsraw.c                  |  3 ++-
 examples/solids/src/setup-libceed.c      |  5 -----
 interface/ceed-preconditioning.c         |  3 ++-
 tests/t217-elemrestriction.c             |  5 +++--
 tests/t319-basis.c                       | 10 ++++++++--
 tests/t530-operator.c                    |  3 ++-
 tests/t531-operator.c                    |  5 ++++-
 tests/t533-operator.c                    |  6 ++++--
 tests/t592-operator.c                    |  3 ++-
 14 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index ca86978165..325fd52c34 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -259,8 +259,9 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
   // Check for unset entries after write-only access
   if (impl->is_write_only_access) {
     for (CeedSize i = 0; i < length; i++) {
-      if (isnan(impl->array_writable_copy[i]))
+      if (isnan(impl->array_writable_copy[i])) {
         CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i);
+      }
     }
     impl->is_write_only_access = false;
   }
diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c
index 04852f28c1..88bd472bf1 100644
--- a/examples/ceed/ex1-volume.c
+++ b/examples/ceed/ex1-volume.c
@@ -318,8 +318,9 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
   }
   CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes,
                             restriction);
-  if (q_data_restriction)
+  if (q_data_restriction) {
     CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction);
+  }
   free(elem_nodes);
   return 0;
 }
diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c
index 2264f26c25..77b1480227 100644
--- a/examples/ceed/ex2-surface.c
+++ b/examples/ceed/ex2-surface.c
@@ -336,9 +336,10 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
       local_elem_nodes[l_nodes] = g_nodes;
     }
   }
-  if (restriction)
+  if (restriction) {
     CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, el_nodes,
                               restriction);
+  }
   free(el_nodes);
 
   if (q_data_restriction) {
diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index 398796f33b..f40e156af2 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -71,33 +71,37 @@ PetscErrorCode OperatorApplyContextCreate(DM dm_x, DM dm_y, Ceed ceed, CeedOpera
       PetscCall(VecGetLocalSize(X_loc, &X_size));
       PetscCheck(X_size == x_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "X_loc (%" PetscInt_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", X_size, x_size);
-      if (dm_x)
+      if (dm_x) {
         PetscCheck(X_size == dm_X_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "X_loc size (%" PetscInt_FMT ") does not match dm_x local vector size (%" PetscInt_FMT ")", X_size, dm_X_size);
+      }
     }
     if (Y_loc) {
       PetscCall(VecGetLocalSize(Y_loc, &Y_size));
       PetscCheck(Y_size == y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "Y_loc (%" PetscInt_FMT ") not correct size for CeedOperator active output size (%" CeedSize_FMT ")", Y_size, y_size);
-      if (dm_y)
+      if (dm_y) {
         PetscCheck(Y_size == dm_Y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "Y_loc size (%" PetscInt_FMT ") does not match dm_y local vector size (%" PetscInt_FMT ")", Y_size, dm_Y_size);
+      }
     }
     if (x_ceed && x_ceed != CEED_VECTOR_NONE) {
       PetscCallCeed(ceed, CeedVectorGetLength(x_ceed, &x_ceed_size));
       PetscCheck(x_size >= 0 ? x_ceed_size == x_size : true, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "x_ceed (%" CeedSize_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", x_ceed_size, x_size);
-      if (dm_x)
+      if (dm_x) {
         PetscCheck(x_ceed_size == dm_X_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "x_ceed size (%" CeedSize_FMT ") does not match dm_x local vector size (%" PetscInt_FMT ")", x_ceed_size, dm_X_size);
+      }
     }
     if (y_ceed && y_ceed != CEED_VECTOR_NONE) {
       PetscCallCeed(ceed, CeedVectorGetLength(y_ceed, &y_ceed_size));
       PetscCheck(y_ceed_size == y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "y_ceed (%" CeedSize_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", y_ceed_size, y_size);
-      if (dm_y)
+      if (dm_y) {
         PetscCheck(y_ceed_size == dm_Y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "y_ceed size (%" CeedSize_FMT ") does not match dm_y local vector size (%" PetscInt_FMT ")", y_ceed_size, dm_Y_size);
+      }
     }
   }
 
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 29b2d8f825..a5bdfe1bb1 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -117,8 +117,9 @@ static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DML
   PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, q_data_sur));
   PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, basis_x_sur, ceed_data->x_coord));
   PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
-  if (elem_restr_jd_i_sur)
+  if (elem_restr_jd_i_sur) {
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_NONE, jac_data_sur));
+  }
 
   if (qf_apply_bc_jacobian && elem_restr_jd_i_sur) {
     PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc_jacobian, NULL, NULL, &op_apply_bc_jacobian));
diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index e43e6567c3..8a6e21bc93 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -506,8 +506,9 @@ int main(int argc, char **argv) {
           l_to_g_ind[here] = g_start[ir][jr][kr] + (ii * g_m_nodes[ir][jr][kr][1] + jj) * g_m_nodes[ir][jr][kr][2] + kk;
           if ((i_rank[0] == 0 && i == 0) || (i_rank[1] == 0 && j == 0) || (i_rank[2] == 0 && k == 0) ||
               (i_rank[0] + 1 == p[0] && i + 1 == l_nodes[0]) || (i_rank[1] + 1 == p[1] && j + 1 == l_nodes[1]) ||
-              (i_rank[2] + 1 == p[2] && k + 1 == l_nodes[2]))
+              (i_rank[2] + 1 == p[2] && k + 1 == l_nodes[2])) {
             continue;
+          }
           l_to_g_ind_0[l_0_count] = l_to_g_ind[here];
           loc_ind[l_0_count++]    = here;
         }
diff --git a/examples/solids/src/setup-libceed.c b/examples/solids/src/setup-libceed.c
index 608278ec5c..717fda952f 100644
--- a/examples/solids/src/setup-libceed.c
+++ b/examples/solids/src/setup-libceed.c
@@ -20,11 +20,6 @@
 #include "../qfunctions/manufactured-force.h"  // Manufactured solution forcing
 #include "../qfunctions/traction-boundary.h"   // Traction boundaries
 
-#if PETSC_VERSION_LT(3, 14, 0)
-#define DMPlexGetClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexGetClosureIndices(a, b, c, d, f, g, i)
-#define DMPlexRestoreClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexRestoreClosureIndices(a, b, c, d, f, g, i)
-#endif
-
 // -----------------------------------------------------------------------------
 // Problem options
 // -----------------------------------------------------------------------------
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index afc2089463..325c5c0017 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -2848,8 +2848,9 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
     CeedCall(CeedVectorGetArrayWrite(q_data, CEED_MEM_HOST, &q_data_array));
     for (CeedInt e = 0; e < num_elem; e++) {
       for (CeedInt c = 0; c < num_comp; c++) {
-        for (CeedInt n = 0; n < num_nodes; n++)
+        for (CeedInt n = 0; n < num_nodes; n++) {
           q_data_array[(e * num_comp + c) * num_nodes + n] = 1. / (elem_avg[e] * fdm_diagonal[c * num_nodes + n]);
+        }
       }
     }
     CeedCall(CeedFree(&elem_avg));
diff --git a/tests/t217-elemrestriction.c b/tests/t217-elemrestriction.c
index ca4f62a048..b9c52c52eb 100644
--- a/tests/t217-elemrestriction.c
+++ b/tests/t217-elemrestriction.c
@@ -55,10 +55,11 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(x, CEED_MEM_HOST, &x_array);
     for (CeedInt i = 0; i < num_elem + 1; i++) {
-      if (x_array[i] != (10 + i) * (i > 0 && i < num_elem ? 2.0 : 1.0))
+      if (x_array[i] != (10 + i) * (i > 0 && i < num_elem ? 2.0 : 1.0)) {
         // LCOV_EXCL_START
         printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", i, (CeedScalar)x_array[i]);
-      // LCOV_EXCL_STOP
+        // LCOV_EXCL_STOP
+      }
     }
     CeedVectorRestoreArrayRead(x, &x_array);
   }
diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index 18afff0a9e..f63299867a 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -224,13 +224,19 @@ int main(int argc, char **argv) {
     CeedScalar tol = 100 * CEED_EPSILON;
 
     for (CeedInt i = 0; i < 4 * 8; i++) {
-      if (fabs(interp_proj[i] - ((CeedScalar *)interp_proj_ref)[i]) > tol)
+      if (fabs(interp_proj[i] - ((CeedScalar *)interp_proj_ref)[i]) > tol) {
+        // LCOV_EXCL_START
         printf("Mixed Topology Projection: interp[%" CeedInt_FMT "] expected %f, got %f\n", i, interp_proj[i], ((CeedScalar *)interp_proj_ref)[i]);
+        // LCOV_EXCL_STOP
+      }
     }
 
     for (CeedInt i = 0; i < 3 * 4 * 8; i++) {
-      if (fabs(grad_proj[i] - ((CeedScalar *)grad_proj_ref)[i]) > tol)
+      if (fabs(grad_proj[i] - ((CeedScalar *)grad_proj_ref)[i]) > tol) {
+        // LCOV_EXCL_START
         printf("Mixed Topology Projection: grad[%" CeedInt_FMT "] expected %f, got %f\n", i, grad_proj[i], ((CeedScalar *)grad_proj_ref)[i]);
+        // LCOV_EXCL_STOP
+      }
     }
 
     CeedBasisDestroy(&basis_face);
diff --git a/tests/t530-operator.c b/tests/t530-operator.c
index d9d18083b0..60716e4544 100644
--- a/tests/t530-operator.c
+++ b/tests/t530-operator.c
@@ -94,12 +94,13 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array);
     CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array);
-    for (CeedInt i = 0; i < num_qpts; i++)
+    for (CeedInt i = 0; i < num_qpts; i++) {
       if (fabs(q_data_array[i] - assembled_array[i]) > 1e-9) {
         // LCOV_EXCL_START
         printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]);
         // LCOV_EXCL_STOP
       }
+    }
     CeedVectorRestoreArrayRead(qf_assembled, &assembled_array);
     CeedVectorRestoreArrayRead(q_data, &q_data_array);
   }
diff --git a/tests/t531-operator.c b/tests/t531-operator.c
index 39168ecba6..767f6a769e 100644
--- a/tests/t531-operator.c
+++ b/tests/t531-operator.c
@@ -131,8 +131,11 @@ int main(int argc, char **argv) {
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     CeedVectorGetArrayRead(v_assembled, CEED_MEM_HOST, &v_assembled_array);
     for (CeedInt i = 0; i < num_dofs; i++) {
-      if (fabs(v_array[i] - v_assembled_array[i]) > 100. * CEED_EPSILON)
+      if (fabs(v_array[i] - v_assembled_array[i]) > 100. * CEED_EPSILON) {
+        // LCOV_EXCL_START
         printf("Error: Linearized operator computed v[i] = %f != %f\n", v_assembled_array[i], v_array[i]);
+        // LCOV_EXCL_STOP
+      }
     }
     CeedVectorRestoreArrayRead(v, &v_array);
     CeedVectorRestoreArrayRead(v_assembled, &v_assembled_array);
diff --git a/tests/t533-operator.c b/tests/t533-operator.c
index 2a19143ffa..4ac1c523de 100644
--- a/tests/t533-operator.c
+++ b/tests/t533-operator.c
@@ -28,11 +28,12 @@ int main(int argc, char **argv) {
   {
     CeedScalar x_array[dim * num_dofs];
 
-    for (CeedInt i = 0; i < nx * 2 + 1; i++)
+    for (CeedInt i = 0; i < nx * 2 + 1; i++) {
       for (CeedInt j = 0; j < ny * 2 + 1; j++) {
         x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx);
         x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny);
       }
+    }
     CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
   CeedVectorCreate(ceed, num_dofs, &u);
@@ -45,8 +46,9 @@ int main(int argc, char **argv) {
     col    = i % nx;
     row    = i / nx;
     offset = col * (p - 1) + row * (nx * 2 + 1) * (p - 1);
-    for (CeedInt j = 0; j < p; j++)
+    for (CeedInt j = 0; j < p; j++) {
       for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (nx * 2 + 1) + j;
+    }
   }
   CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
   CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_u);
diff --git a/tests/t592-operator.c b/tests/t592-operator.c
index 91e519f3bb..1650e0fa89 100644
--- a/tests/t592-operator.c
+++ b/tests/t592-operator.c
@@ -173,12 +173,13 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array);
     CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array);
-    for (CeedInt i = 0; i < num_points; i++)
+    for (CeedInt i = 0; i < num_points; i++) {
       if (fabs(q_data_array[i] - assembled_array[i]) > 1e-9) {
         // LCOV_EXCL_START
         printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]);
         // LCOV_EXCL_STOP
       }
+    }
     CeedVectorRestoreArrayRead(qf_assembled, &assembled_array);
     CeedVectorRestoreArrayRead(q_data, &q_data_array);
   }

From 43e13fee434cb91795700b4c22c1d8174b8fa174 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 24 Sep 2024 14:47:11 -0600
Subject: [PATCH 177/571] gpu - refactor ref operator

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 282 +++++++++++----------
 backends/hip-ref/ceed-hip-ref-operator.c   | 282 +++++++++++----------
 2 files changed, 286 insertions(+), 278 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index fcc7631e6c..5add9b1e97 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -317,46 +317,45 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
 }
 
 //------------------------------------------------------------------------------
-// Setup Operator Inputs
+// Restrict Operator Inputs
 //------------------------------------------------------------------------------
-static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                               CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                               CeedOperator_Cuda *impl, CeedRequest *request) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
+static inline int CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                 CeedVector in_vec, const bool skip_active, CeedScalar **e_data, CeedOperator_Cuda *impl,
+                                                 CeedRequest *request) {
+  CeedEvalMode        eval_mode;
+  CeedVector          vec;
+  CeedElemRestriction elem_rstr;
+
+  // Get input vector
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+  if (vec == CEED_VECTOR_ACTIVE) {
+    if (skip_active) return CEED_ERROR_SUCCESS;
+    else vec = in_vec;
+  }
 
+  // Restriction action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+  } else {
     // Get input vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
-      if (skip_active) continue;
-      else vec = in_vec;
-    }
-
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    // Get input element restriction
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+    if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
+    // Restrict, if necessary
+    if (!impl->e_vecs[input_field]) {
+      // No restriction for this field; read data directly from vec.
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     } else {
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      // Get input element restriction
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-      if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
-      // Restrict, if necessary
-      if (!impl->e_vecs[i]) {
-        // No restriction for this field; read data directly from vec.
-        CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
-      } else {
-        uint64_t state;
+      uint64_t state;
 
-        CeedCallBackend(CeedVectorGetState(vec, &state));
-        if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
-          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-        }
-        impl->input_states[i] = state;
-        // Get evec
-        CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
+      CeedCallBackend(CeedVectorGetState(vec, &state));
+      if ((state != impl->input_states[input_field] || vec == in_vec) && !impl->skip_rstr_in[input_field]) {
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[input_field], request));
       }
+      impl->input_states[input_field] = state;
+      // Get evec
+      CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[input_field], CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -365,42 +364,36 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu
 //------------------------------------------------------------------------------
 // Input Basis Action
 //------------------------------------------------------------------------------
-static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                              CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                              CeedOperator_Cuda *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                              CeedInt num_elem, const bool skip_active, CeedScalar *e_data, CeedOperator_Cuda *impl) {
+  CeedEvalMode eval_mode;
 
-    // Skip active input
-    if (skip_active) {
-      CeedVector vec;
+  // Skip active input
+  if (skip_active) {
+    CeedVector vec;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
-    }
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
-    // Basis action
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
-        break;
-      case CEED_EVAL_INTERP:
-      case CEED_EVAL_GRAD:
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      break;
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      break;
     }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -408,26 +401,26 @@ static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionFie
 //------------------------------------------------------------------------------
 // Restore Input Vectors
 //------------------------------------------------------------------------------
-static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                                 const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                const bool skip_active, CeedScalar **e_data, CeedOperator_Cuda *impl) {
+  CeedEvalMode eval_mode;
+  CeedVector   vec;
+
+  // Skip active input
+  if (skip_active) {
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  }
 
-    // Skip active input
-    if (skip_active) {
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
-    }
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+  // Restore e-vec
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+  } else {
+    if (!impl->e_vecs[input_field]) {  // This was a skip_restriction case
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)e_data));
     } else {
-      if (!impl->e_vecs[i]) {  // This was a skip_restriction case
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-        CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
-      } else {
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
-      }
+      CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[input_field], (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -454,11 +447,11 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
   // Setup
   CeedCallBackend(CeedOperatorSetup_Cuda(op));
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
-
-  // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, false, e_data[i], impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -476,7 +469,9 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+  }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -637,43 +632,38 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Input Basis Action AtPoints
 //------------------------------------------------------------------------------
-static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const CeedInt *num_points, CeedQFunctionField *qf_input_fields,
-                                                      CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active,
-                                                      CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                      CeedInt num_elem, const CeedInt *num_points, const bool skip_active, CeedScalar *e_data,
+                                                      CeedOperator_Cuda *impl) {
+  CeedEvalMode eval_mode;
 
-    // Skip active input
-    if (skip_active) {
-      CeedVector vec;
+  // Skip active input
+  if (skip_active) {
+    CeedVector vec;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
-    }
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
-    // Basis action
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
-        break;
-      case CEED_EVAL_INTERP:
-      case CEED_EVAL_GRAD:
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
-                                               impl->q_vecs_in[i]));
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      break;
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
+                                             impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      break;
     }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -700,9 +690,6 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
-
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -713,8 +700,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
-  // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, false, e_data[i], impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -732,7 +722,9 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+  }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -823,7 +815,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   CeedCallBackend(CeedOperatorSetup_Cuda(op));
 
   // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+  }
 
   // Count number of active input fields
   if (!num_active_in) {
@@ -889,7 +883,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
 
   // Input basis apply
-  CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
+  }
 
   // Assemble QFunction
   for (CeedInt in = 0; in < num_active_in; in++) {
@@ -928,7 +924,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   }
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+  }
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
@@ -1652,9 +1650,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   }
   impl->has_shared_e_vecs = false;
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
-
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -1665,6 +1660,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
+  // Input Evecs and Restriction
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+  }
+
   // Clear active input Qvecs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedVector vec;
@@ -1675,7 +1675,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   }
 
   // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data[i], impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -1833,7 +1835,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   }
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+  }
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 8bc3f0bc35..858546bb25 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -316,46 +316,45 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
 }
 
 //------------------------------------------------------------------------------
-// Setup Operator Inputs
+// Restrict Operator Inputs
 //------------------------------------------------------------------------------
-static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                              CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                              CeedOperator_Hip *impl, CeedRequest *request) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
+static inline int CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                CeedVector in_vec, const bool skip_active, CeedScalar **e_data, CeedOperator_Hip *impl,
+                                                CeedRequest *request) {
+  CeedEvalMode        eval_mode;
+  CeedVector          vec;
+  CeedElemRestriction elem_rstr;
+
+  // Get input vector
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+  if (vec == CEED_VECTOR_ACTIVE) {
+    if (skip_active) return CEED_ERROR_SUCCESS;
+    else vec = in_vec;
+  }
 
+  // Restriction action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+  } else {
     // Get input vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
-      if (skip_active) continue;
-      else vec = in_vec;
-    }
-
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    // Get input element restriction
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+    if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
+    // Restrict, if necessary
+    if (!impl->e_vecs[input_field]) {
+      // No restriction for this field; read data directly from vec.
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     } else {
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      // Get input element restriction
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-      if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
-      // Restrict, if necessary
-      if (!impl->e_vecs[i]) {
-        // No restriction for this field; read data directly from vec.
-        CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
-      } else {
-        uint64_t state;
+      uint64_t state;
 
-        CeedCallBackend(CeedVectorGetState(vec, &state));
-        if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
-          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-        }
-        impl->input_states[i] = state;
-        // Get evec
-        CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
+      CeedCallBackend(CeedVectorGetState(vec, &state));
+      if ((state != impl->input_states[input_field] || vec == in_vec) && !impl->skip_rstr_in[input_field]) {
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[input_field], request));
       }
+      impl->input_states[input_field] = state;
+      // Get evec
+      CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[input_field], CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -364,42 +363,36 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun
 //------------------------------------------------------------------------------
 // Input Basis Action
 //------------------------------------------------------------------------------
-static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                             CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                             CeedOperator_Hip *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                             CeedInt num_elem, const bool skip_active, CeedScalar *e_data, CeedOperator_Hip *impl) {
+  CeedEvalMode eval_mode;
 
-    // Skip active input
-    if (skip_active) {
-      CeedVector vec;
+  // Skip active input
+  if (skip_active) {
+    CeedVector vec;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
-    }
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
-    // Basis action
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
-        break;
-      case CEED_EVAL_INTERP:
-      case CEED_EVAL_GRAD:
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      break;
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      break;
     }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -407,26 +400,26 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionFiel
 //------------------------------------------------------------------------------
 // Restore Input Vectors
 //------------------------------------------------------------------------------
-static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                                const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                               const bool skip_active, CeedScalar **e_data, CeedOperator_Hip *impl) {
+  CeedEvalMode eval_mode;
+  CeedVector   vec;
+
+  // Skip active input
+  if (skip_active) {
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  }
 
-    // Skip active input
-    if (skip_active) {
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
-    }
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+  // Restore e-vec
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+  } else {
+    if (!impl->e_vecs[input_field]) {  // This was a skip_restriction case
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)e_data));
     } else {
-      if (!impl->e_vecs[i]) {  // This was a skip_restriction case
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-        CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
-      } else {
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
-      }
+      CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[input_field], (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -453,11 +446,11 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
   // Setup
   CeedCallBackend(CeedOperatorSetup_Hip(op));
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
-
-  // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, false, e_data[i], impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -475,7 +468,9 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+  }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -636,43 +631,38 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Input Basis Action AtPoints
 //------------------------------------------------------------------------------
-static inline int CeedOperatorInputBasisAtPoints_Hip(CeedInt num_elem, const CeedInt *num_points, CeedQFunctionField *qf_input_fields,
-                                                     CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active,
-                                                     CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                     CeedInt num_elem, const CeedInt *num_points, const bool skip_active, CeedScalar *e_data,
+                                                     CeedOperator_Hip *impl) {
+  CeedEvalMode eval_mode;
 
-    // Skip active input
-    if (skip_active) {
-      CeedVector vec;
+  // Skip active input
+  if (skip_active) {
+    CeedVector vec;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
-    }
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
-    // Basis action
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
-        break;
-      case CEED_EVAL_INTERP:
-      case CEED_EVAL_GRAD:
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
-                                               impl->q_vecs_in[i]));
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
+    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      break;
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
+                                             impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      break;
     }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -699,9 +689,6 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
-
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -712,8 +699,11 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
-  // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, false, e_data[i], impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -731,7 +721,9 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+  }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -822,7 +814,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetup_Hip(op));
 
   // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+  }
 
   // Count number of active input fields
   if (!num_active_in) {
@@ -888,7 +882,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
 
   // Input basis apply
-  CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
+  }
 
   // Assemble QFunction
   for (CeedInt in = 0; in < num_active_in; in++) {
@@ -927,7 +923,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   }
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+  }
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
@@ -1649,9 +1647,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   }
   impl->has_shared_e_vecs = false;
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
-
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -1662,6 +1657,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
+  // Input Evecs and Restriction
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+  }
+
   // Clear active input Qvecs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedVector vec;
@@ -1672,7 +1672,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   }
 
   // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(num_elem, num_points, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data[i], impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -1830,7 +1832,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   }
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+  }
   return CEED_ERROR_SUCCESS;
 }
 

From 034f99fd28bbd688e3b7ed077b6d504c2f765b9c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 25 Sep 2024 14:08:49 -0600
Subject: [PATCH 178/571] gpu - further ref refactoring

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |   2 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 530 ++++++++---------
 backends/cuda-ref/ceed-cuda-ref.h             |   7 +-
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |   2 +-
 backends/hip-ref/ceed-hip-ref-operator.c      | 537 +++++++++---------
 backends/hip-ref/ceed-hip-ref.h               |   7 +-
 6 files changed, 565 insertions(+), 520 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 0b45db90c8..40ddf59ad1 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -839,11 +839,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
       CeedElemRestriction rstr_i;
 
       if (is_ordered[i]) continue;
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
       field_rstr_in_buffer[i]       = i;
       is_ordered[i]                 = true;
       input_field_order[curr_index] = i;
       curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
       for (CeedInt j = i + 1; j < num_input_fields; j++) {
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 5add9b1e97..515cc9a847 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -31,20 +31,22 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
-  for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->e_vecs));
+  CeedCallBackend(CeedFree(&impl->input_field_order));
+  CeedCallBackend(CeedFree(&impl->output_field_order));
   CeedCallBackend(CeedFree(&impl->input_states));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_in));
   CeedCallBackend(CeedFree(&impl->q_vecs_in));
 
   for (CeedInt i = 0; i < impl->num_outputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_out));
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
   CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
@@ -101,7 +103,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
 // Setup infields or outfields
 //------------------------------------------------------------------------------
 static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
-                                        CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+                                        CeedVector *e_vecs, CeedVector *q_vecs, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -117,7 +119,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    bool         is_strided = false, skip_restriction = false;
+    bool         is_strided = false, skip_e_vec = false;
     CeedSize     q_size;
     CeedInt      size;
     CeedEvalMode eval_mode;
@@ -133,27 +135,24 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
 
       // First, check whether the field is input or output:
       if (is_input) {
-        CeedVector vec;
+        CeedVector l_vec;
 
         // Check for passive input
-        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
-        if (vec != CEED_VECTOR_ACTIVE) {
-          // Check eval_mode
-          if (eval_mode == CEED_EVAL_NONE) {
-            // Check for strided restriction
-            CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-            if (is_strided) {
-              // Check if vector is already in preferred backend ordering
-              CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
-            }
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
+        if (l_vec != CEED_VECTOR_ACTIVE && eval_mode == CEED_EVAL_NONE) {
+          // Check for strided restriction
+          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+          if (is_strided) {
+            // Check if vector is already in preferred backend ordering
+            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
           }
         }
       }
-      if (skip_restriction) {
-        // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
-        e_vecs[i + start_e] = NULL;
+      if (skip_e_vec) {
+        // Either an active field or strided local vec in backend ordering
+        e_vecs[i] = NULL;
       } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
+        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
       }
     }
 
@@ -202,7 +201,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j] = true;
         }
       }
@@ -221,7 +220,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
@@ -255,63 +254,92 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Allocate
-  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
   // Set up infield and outfield e-vecs and q-vecs
-  // Infields
   CeedCallBackend(
-      CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
-  // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
-                                               num_input_fields, num_output_fields, Q, num_elem));
+      CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                               impl->q_vecs_out, num_output_fields, Q, num_elem));
 
-  // Reuse active e-vecs where able
+  // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
   {
-    CeedInt              num_used  = 0;
-    CeedElemRestriction *rstr_used = NULL;
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
 
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      bool                is_used = false;
+      CeedSize            e_vec_len_i;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
-      for (CeedInt j = 0; j < num_used; j++) {
-        if (rstr_i == rstr_used[i]) is_used = true;
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
       }
-      if (is_used) continue;
-      num_used++;
-      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
-      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
-      rstr_used[num_used - 1] = rstr_i;
-      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
-        CeedEvalMode        eval_mode;
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
 
         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
-        if (vec_j != CEED_VECTOR_ACTIVE) continue;
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
-        if (eval_mode == CEED_EVAL_NONE) continue;
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
-        if (rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
         }
       }
     }
-    CeedCallBackend(CeedFree(&rstr_used));
   }
-  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -322,40 +350,39 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
 static inline int CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                                  CeedVector in_vec, const bool skip_active, CeedScalar **e_data, CeedOperator_Cuda *impl,
                                                  CeedRequest *request) {
-  CeedEvalMode        eval_mode;
-  CeedVector          vec;
-  CeedElemRestriction elem_rstr;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Get input vector
-  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-  if (vec == CEED_VECTOR_ACTIVE) {
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  if (l_vec == CEED_VECTOR_ACTIVE) {
     if (skip_active) return CEED_ERROR_SUCCESS;
-    else vec = in_vec;
+    else l_vec = in_vec;
   }
 
   // Restriction action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
   } else {
-    // Get input vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    // Get input element restriction
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
-    if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
-    // Restrict, if necessary
-    if (!impl->e_vecs[input_field]) {
+    if (!e_vec) {
       // No restriction for this field; read data directly from vec.
-      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
+      CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     } else {
-      uint64_t state;
+      // Restrict, if necessary
+      if (!impl->skip_rstr_in[input_field]) {
+        uint64_t state;
+
+        CeedCallBackend(CeedVectorGetState(l_vec, &state));
+        if (state != impl->input_states[input_field] || l_vec == in_vec) {
+          CeedElemRestriction elem_rstr;
 
-      CeedCallBackend(CeedVectorGetState(vec, &state));
-      if ((state != impl->input_states[input_field] || vec == in_vec) && !impl->skip_rstr_in[input_field]) {
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[input_field], request));
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
+        }
+        impl->input_states[input_field] = state;
       }
-      impl->input_states[input_field] = state;
-      // Get evec
-      CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[input_field], CEED_MEM_DEVICE, (const CeedScalar **)e_data));
+      // Get e-vec
+      CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -367,20 +394,21 @@ static inline int CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_fiel
 static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                               CeedInt num_elem, const bool skip_active, CeedScalar *e_data, CeedOperator_Cuda *impl) {
   CeedEvalMode eval_mode;
+  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
   if (skip_active) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
     case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
       break;
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
@@ -389,7 +417,7 @@ static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field,
       CeedBasis basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
-      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec));
       break;
     }
     case CEED_EVAL_WEIGHT:
@@ -404,23 +432,20 @@ static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field,
 static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                                 const bool skip_active, CeedScalar **e_data, CeedOperator_Cuda *impl) {
   CeedEvalMode eval_mode;
-  CeedVector   vec;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Skip active input
-  if (skip_active) {
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
-  }
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  if (skip_active && l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
 
   // Restore e-vec
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
   } else {
-    if (!impl->e_vecs[input_field]) {  // This was a skip_restriction case
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-      CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)e_data));
+    if (!e_vec) {  // This was a skip_restriction case
+      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, (const CeedScalar **)e_data));
     } else {
-      CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[input_field], (const CeedScalar **)e_data));
+      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -430,8 +455,8 @@ static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedInt             Q, num_elem, num_input_fields, num_output_fields;
+  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -449,8 +474,11 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, false, e_data[i], impl));
+    CeedInt field = impl->input_field_order[i];
+
+    CeedCallBackend(
+        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[field], qf_input_fields[field], field, num_elem, false, e_data_in[field], impl));
   }
 
   // Output pointers, as necessary
@@ -460,8 +488,8 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
       // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
     }
   }
 
@@ -470,21 +498,23 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
   }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedInt             field = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
+    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         break;  // No action
@@ -492,11 +522,11 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        if (impl->apply_add_basis_out[i]) {
-          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         } else {
-          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         }
         break;
       // LCOV_EXCL_START
@@ -505,28 +535,16 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
         // LCOV_EXCL_STOP
       }
     }
-  }
-
-  // Output restriction
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
 
     // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
     }
-    if (impl->skip_rstr_out[i]) continue;
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    // Restrict
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
 
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+    // Restrict
+    if (impl->skip_rstr_out[field]) continue;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -568,63 +586,85 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   impl->max_num_points = max_num_points;
 
   // Allocate
-  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
   // Set up infield and outfield e-vecs and q-vecs
-  // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields,
                                                max_num_points, num_elem));
-  // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
-                                               num_input_fields, num_output_fields, max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                               impl->q_vecs_out, num_output_fields, max_num_points, num_elem));
 
-  // Reuse active e-vecs where able
+  // Reorder fields to allow reuse of buffers
   {
-    CeedInt              num_used  = 0;
-    CeedElemRestriction *rstr_used = NULL;
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
 
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      bool                is_used = false;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
-      for (CeedInt j = 0; j < num_used; j++) {
-        if (rstr_i == rstr_used[i]) is_used = true;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
       }
-      if (is_used) continue;
-      num_used++;
-      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
-      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
-      rstr_used[num_used - 1] = rstr_i;
-      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
-        CeedEvalMode        eval_mode;
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
 
         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
-        if (vec_j != CEED_VECTOR_ACTIVE) continue;
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
-        if (eval_mode == CEED_EVAL_NONE) continue;
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
-        if (rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
         }
       }
     }
-    CeedCallBackend(CeedFree(&rstr_used));
   }
-  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -636,20 +676,21 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input
                                                       CeedInt num_elem, const CeedInt *num_points, const bool skip_active, CeedScalar *e_data,
                                                       CeedOperator_Cuda *impl) {
   CeedEvalMode eval_mode;
+  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
   if (skip_active) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
     case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
       break;
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
@@ -658,8 +699,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input
       CeedBasis basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
-      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
-                                             impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec));
       break;
     }
     case CEED_EVAL_WEIGHT:
@@ -672,8 +712,8 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input
 // Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
+  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -702,8 +742,12 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, false, e_data[i], impl));
+    CeedInt field = impl->input_field_order[i];
+
+    CeedCallBackend(
+        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, num_elem, num_points, false,
+                                                        e_data_in[field], impl));
   }
 
   // Output pointers, as necessary
@@ -713,8 +757,8 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
       // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
     }
   }
 
@@ -723,21 +767,23 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
   }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedInt             field = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
+    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         break;  // No action
@@ -745,13 +791,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        if (impl->apply_add_basis_out[i]) {
-          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                    impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         } else {
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                                 impl->e_vecs[i + impl->num_inputs]));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         }
         break;
       // LCOV_EXCL_START
@@ -760,28 +804,16 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
         // LCOV_EXCL_STOP
       }
     }
-  }
-
-  // Output restriction
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
 
     // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
     }
-    if (impl->skip_rstr_out[i]) continue;
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    // Restrict
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
 
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+    // Restrict
+    if (impl->skip_rstr_out[field]) continue;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -814,21 +846,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   // Setup
   CeedCallBackend(CeedOperatorSetup_Cuda(op));
 
-  // Input Evecs and Restriction
+  // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
   }
 
   // Count number of active input fields
   if (!num_active_in) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedVector  l_vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
@@ -851,12 +883,11 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   // Count number of active output fields
   if (!num_active_out) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
@@ -882,11 +913,6 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
 
-  // Input basis apply
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
-  }
-
   // Assemble QFunction
   for (CeedInt in = 0; in < num_active_in; in++) {
     // Set Inputs
@@ -896,12 +922,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
     }
     // Set Outputs
     for (CeedInt out = 0; out < num_output_fields; out++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
       // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
@@ -913,12 +939,10 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
 
   // Un-set output q-vecs to prevent accidental overwrite of Assembled
   for (CeedInt out = 0; out < num_output_fields; out++) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-    // Check if active output
-    if (vec == CEED_VECTOR_ACTIVE) {
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
   }
@@ -1510,8 +1534,8 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
 // Assemble matrix data for COO matrix of assembled operator.
 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
 //
-// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
-// modes).
+// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator
+// (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
 static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) {
@@ -1622,7 +1646,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -1643,10 +1667,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   if (impl->has_shared_e_vecs) {
     for (CeedInt i = 0; i < impl->num_outputs; i++) {
       CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
-      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i]));
+      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
     }
-    CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
-                                                 num_input_fields, num_output_fields, max_num_points, num_elem));
+    CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                                 impl->q_vecs_out, num_output_fields, max_num_points, num_elem));
   }
   impl->has_shared_e_vecs = false;
 
@@ -1660,9 +1684,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
-  // Input Evecs and Restriction
+  // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data_in[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data_in[i], impl));
   }
 
   // Clear active input Qvecs
@@ -1674,11 +1699,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
-  // Input basis apply if needed
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data[i], impl));
-  }
-
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode eval_mode;
@@ -1686,8 +1706,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
       // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
     }
   }
 
@@ -1696,12 +1716,12 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     bool                is_active_at_points = true;
     CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
     CeedRestrictionType rstr_type;
-    CeedVector          vec;
+    CeedVector          l_vec;
     CeedElemRestriction elem_rstr;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
     // -- Skip non-active input
-    if (vec != CEED_VECTOR_ACTIVE) continue;
+    if (l_vec != CEED_VECTOR_ACTIVE) continue;
 
     // -- Get active restriction type
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -1724,23 +1744,23 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       if (!is_active_input) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
-      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
-      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s - 1, e_vec_size, 0.0));
+      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s, e_vec_size, 1.0));
 
       // Basis action
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_in[i]));
           break;
         case CEED_EVAL_INTERP:
         case CEED_EVAL_GRAD:
         case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
-                                                 impl->q_vecs_in[i]));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
+                                                 impl->e_vecs_in[i], impl->q_vecs_in[i]));
           break;
         case CEED_EVAL_WEIGHT:
           break;  // No action
@@ -1755,13 +1775,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         CeedInt             elem_size        = 0;
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
-        CeedVector          vec;
+        CeedVector          l_vec;
         CeedElemRestriction elem_rstr;
         CeedBasis           basis;
 
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
         // ---- Skip non-active output
-        is_active_output = vec == CEED_VECTOR_ACTIVE;
+        is_active_output = l_vec == CEED_VECTOR_ACTIVE;
         if (!is_active_output) continue;
 
         // ---- Check if elem size matches
@@ -1784,7 +1804,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
         switch (eval_mode) {
           case CEED_EVAL_NONE:
-            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[j + impl->num_inputs], &e_data[j + num_input_fields]));
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &e_data_out[j]));
             break;
           case CEED_EVAL_INTERP:
           case CEED_EVAL_GRAD:
@@ -1792,7 +1812,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
           case CEED_EVAL_CURL:
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
             CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                   impl->q_vecs_out[j], impl->e_vecs[j + impl->num_inputs]));
+                                                   impl->q_vecs_out[j], impl->e_vecs_out[j]));
             break;
           // LCOV_EXCL_START
           case CEED_EVAL_WEIGHT: {
@@ -1802,16 +1822,16 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         }
 
         // Mask output e-vec
-        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[j + impl->num_inputs], impl->e_vecs[i], impl->e_vecs[j + impl->num_inputs]));
+        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs_out[j], impl->e_vecs_in[i], impl->e_vecs_out[j]));
 
         // Restrict
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[j + impl->num_inputs], assembled, request));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
 
         // Reset q_vec for
         if (eval_mode == CEED_EVAL_NONE) {
-          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[j + impl->num_inputs], CEED_MEM_DEVICE, &e_data[j + num_input_fields]));
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[j + num_input_fields]));
+          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[j], CEED_MEM_DEVICE, &e_data_out[j]));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[j]));
         }
       }
 
@@ -1830,13 +1850,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     // Restore evec
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_data_in[i]));
     }
   }
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, true, &e_data_in[i], impl));
   }
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 9a63a5f4f4..b6e7016013 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -133,11 +133,12 @@ typedef struct {
 typedef struct {
   bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
   uint64_t                  *input_states;  // State tracking for passive inputs
-  CeedVector                *e_vecs;        // E-vectors, inputs followed by outputs
-  CeedVector                *q_vecs_in;     // Input Q-vectors needed to apply operator
-  CeedVector                *q_vecs_out;    // Output Q-vectors needed to apply operator
+  CeedVector                *e_vecs_in, *e_vecs_out;
+  CeedVector                *q_vecs_in, *q_vecs_out;
   CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
+  CeedInt                   *input_field_order, *output_field_order;
+  CeedSize                   max_active_e_vec_len;
   CeedInt                    max_num_points;
   CeedInt                   *num_points;
   CeedVector                *qf_active_in, point_coords_elem;
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index c2e21a5468..c3298b1b27 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -848,11 +848,11 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
       CeedElemRestriction rstr_i;
 
       if (is_ordered[i]) continue;
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
       field_rstr_in_buffer[i]       = i;
       is_ordered[i]                 = true;
       input_field_order[curr_index] = i;
       curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
       for (CeedInt j = i + 1; j < num_input_fields; j++) {
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 858546bb25..4534e840ee 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -30,20 +30,22 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
-  for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->e_vecs));
+  CeedCallBackend(CeedFree(&impl->input_field_order));
+  CeedCallBackend(CeedFree(&impl->output_field_order));
   CeedCallBackend(CeedFree(&impl->input_states));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_in));
   CeedCallBackend(CeedFree(&impl->q_vecs_in));
 
   for (CeedInt i = 0; i < impl->num_outputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_out));
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
   CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
@@ -100,7 +102,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
 // Setup infields or outfields
 //------------------------------------------------------------------------------
 static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
-                                       CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+                                       CeedVector *e_vecs, CeedVector *q_vecs, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -116,7 +118,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    bool         is_strided = false, skip_restriction = false;
+    bool         is_strided = false, skip_e_vec = false;
     CeedSize     q_size;
     CeedInt      size;
     CeedEvalMode eval_mode;
@@ -132,27 +134,24 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
 
       // First, check whether the field is input or output:
       if (is_input) {
-        CeedVector vec;
+        CeedVector l_vec;
 
         // Check for passive input
-        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
-        if (vec != CEED_VECTOR_ACTIVE) {
-          // Check eval_mode
-          if (eval_mode == CEED_EVAL_NONE) {
-            // Check for strided restriction
-            CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-            if (is_strided) {
-              // Check if vector is already in preferred backend ordering
-              CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
-            }
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
+        if (l_vec != CEED_VECTOR_ACTIVE && eval_mode == CEED_EVAL_NONE) {
+          // Check for strided restriction
+          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+          if (is_strided) {
+            // Check if vector is already in preferred backend ordering
+            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
           }
         }
       }
-      if (skip_restriction) {
-        // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
-        e_vecs[i + start_e] = NULL;
+      if (skip_e_vec) {
+        // Either an active field or strided local vec in backend ordering
+        e_vecs[i] = NULL;
       } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
+        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
       }
     }
 
@@ -201,7 +200,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j] = true;
         }
       }
@@ -220,7 +219,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i + start_e], &e_vecs[j + start_e]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
@@ -254,63 +253,92 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Allocate
-  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
   // Set up infield and outfield e-vecs and q-vecs
-  // Infields
   CeedCallBackend(
-      CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
-  // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
-                                              num_input_fields, num_output_fields, Q, num_elem));
+      CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                              impl->q_vecs_out, num_output_fields, Q, num_elem));
 
-  // Reuse active e-vecs where able
+  // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
   {
-    CeedInt              num_used  = 0;
-    CeedElemRestriction *rstr_used = NULL;
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
 
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      bool                is_used = false;
+      CeedSize            e_vec_len_i;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
-      for (CeedInt j = 0; j < num_used; j++) {
-        if (rstr_i == rstr_used[i]) is_used = true;
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
       }
-      if (is_used) continue;
-      num_used++;
-      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
-      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
-      rstr_used[num_used - 1] = rstr_i;
-      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
-        CeedEvalMode        eval_mode;
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
 
         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
-        if (vec_j != CEED_VECTOR_ACTIVE) continue;
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
-        if (eval_mode == CEED_EVAL_NONE) continue;
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
-        if (rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
         }
       }
     }
-    CeedCallBackend(CeedFree(&rstr_used));
   }
-  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -321,40 +349,39 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
 static inline int CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                                 CeedVector in_vec, const bool skip_active, CeedScalar **e_data, CeedOperator_Hip *impl,
                                                 CeedRequest *request) {
-  CeedEvalMode        eval_mode;
-  CeedVector          vec;
-  CeedElemRestriction elem_rstr;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Get input vector
-  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-  if (vec == CEED_VECTOR_ACTIVE) {
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  if (l_vec == CEED_VECTOR_ACTIVE) {
     if (skip_active) return CEED_ERROR_SUCCESS;
-    else vec = in_vec;
+    else l_vec = in_vec;
   }
 
   // Restriction action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
   } else {
-    // Get input vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    // Get input element restriction
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
-    if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
-    // Restrict, if necessary
-    if (!impl->e_vecs[input_field]) {
+    if (!e_vec) {
       // No restriction for this field; read data directly from vec.
-      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
+      CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     } else {
-      uint64_t state;
+      // Restrict, if necessary
+      if (!impl->skip_rstr_in[input_field]) {
+        uint64_t state;
 
-      CeedCallBackend(CeedVectorGetState(vec, &state));
-      if ((state != impl->input_states[input_field] || vec == in_vec) && !impl->skip_rstr_in[input_field]) {
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[input_field], request));
+        CeedCallBackend(CeedVectorGetState(l_vec, &state));
+        if (state != impl->input_states[input_field] || l_vec == in_vec) {
+          CeedElemRestriction elem_rstr;
+
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
+        }
+        impl->input_states[input_field] = state;
       }
-      impl->input_states[input_field] = state;
-      // Get evec
-      CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[input_field], CEED_MEM_DEVICE, (const CeedScalar **)e_data));
+      // Get e-vec
+      CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -366,20 +393,21 @@ static inline int CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field
 static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                              CeedInt num_elem, const bool skip_active, CeedScalar *e_data, CeedOperator_Hip *impl) {
   CeedEvalMode eval_mode;
+  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
   if (skip_active) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
     case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
       break;
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
@@ -388,7 +416,7 @@ static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, C
       CeedBasis basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
-      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec));
       break;
     }
     case CEED_EVAL_WEIGHT:
@@ -403,23 +431,20 @@ static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, C
 static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                                const bool skip_active, CeedScalar **e_data, CeedOperator_Hip *impl) {
   CeedEvalMode eval_mode;
-  CeedVector   vec;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Skip active input
-  if (skip_active) {
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
-  }
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  if (skip_active && l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
 
   // Restore e-vec
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
   } else {
-    if (!impl->e_vecs[input_field]) {  // This was a skip_restriction case
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-      CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)e_data));
+    if (!e_vec) {  // This was a skip_restriction case
+      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, (const CeedScalar **)e_data));
     } else {
-      CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[input_field], (const CeedScalar **)e_data));
+      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, (const CeedScalar **)e_data));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -429,8 +454,8 @@ static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field,
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedInt             Q, num_elem, num_input_fields, num_output_fields;
+  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -448,8 +473,11 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, false, e_data[i], impl));
+    CeedInt field = impl->input_field_order[i];
+
+    CeedCallBackend(
+        CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[field], qf_input_fields[field], field, num_elem, false, e_data_in[field], impl));
   }
 
   // Output pointers, as necessary
@@ -459,8 +487,8 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
       // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
     }
   }
 
@@ -469,21 +497,23 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
   }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedInt             field = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
+    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         break;  // No action
@@ -491,11 +521,11 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        if (impl->apply_add_basis_out[i]) {
-          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         } else {
-          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         }
         break;
       // LCOV_EXCL_START
@@ -504,28 +534,16 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
         // LCOV_EXCL_STOP
       }
     }
-  }
-
-  // Output restriction
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
 
     // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
     }
-    if (impl->skip_rstr_out[i]) continue;
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    // Restrict
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
 
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+    // Restrict
+    if (impl->skip_rstr_out[field]) continue;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -567,63 +585,85 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
   impl->max_num_points = max_num_points;
 
   // Allocate
-  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
   // Set up infield and outfield e-vecs and q-vecs
-  // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields,
                                               max_num_points, num_elem));
-  // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
-                                              num_input_fields, num_output_fields, max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out, impl->q_vecs_out,
+                                              num_output_fields, max_num_points, num_elem));
 
-  // Reuse active e-vecs where able
+  // Reorder fields to allow reuse of buffers
   {
-    CeedInt              num_used  = 0;
-    CeedElemRestriction *rstr_used = NULL;
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
 
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      bool                is_used = false;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i != CEED_VECTOR_ACTIVE) continue;
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
-      for (CeedInt j = 0; j < num_used; j++) {
-        if (rstr_i == rstr_used[i]) is_used = true;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
       }
-      if (is_used) continue;
-      num_used++;
-      if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
-      else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
-      rstr_used[num_used - 1] = rstr_i;
-      for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
-        CeedEvalMode        eval_mode;
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
 
         CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
-        if (vec_j != CEED_VECTOR_ACTIVE) continue;
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
-        if (eval_mode == CEED_EVAL_NONE) continue;
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
-        if (rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
         }
       }
     }
-    CeedCallBackend(CeedFree(&rstr_used));
   }
-  impl->has_shared_e_vecs = true;
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -635,20 +675,21 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_
                                                      CeedInt num_elem, const CeedInt *num_points, const bool skip_active, CeedScalar *e_data,
                                                      CeedOperator_Hip *impl) {
   CeedEvalMode eval_mode;
+  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
   if (skip_active) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &vec));
-    if (vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
     case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
       break;
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
@@ -657,8 +698,7 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_
       CeedBasis basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
-      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
-                                             impl->e_vecs[input_field], impl->q_vecs_in[input_field]));
+      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec));
       break;
     }
     case CEED_EVAL_WEIGHT:
@@ -671,8 +711,8 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_
 // Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
+  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -701,8 +741,12 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, false, &e_data[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, false, e_data[i], impl));
+    CeedInt field = impl->input_field_order[i];
+
+    CeedCallBackend(
+        CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, num_elem, num_points, false,
+                                                       e_data_in[field], impl));
   }
 
   // Output pointers, as necessary
@@ -712,8 +756,8 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
       // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
     }
   }
 
@@ -722,21 +766,23 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
   }
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedInt             field = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
+    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         break;  // No action
@@ -744,13 +790,11 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        if (impl->apply_add_basis_out[i]) {
-          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                    impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         } else {
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[i],
-                                                 impl->e_vecs[i + impl->num_inputs]));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         }
         break;
       // LCOV_EXCL_START
@@ -759,28 +803,16 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
         // LCOV_EXCL_STOP
       }
     }
-  }
-
-  // Output restriction
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
 
     // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
     }
-    if (impl->skip_rstr_out[i]) continue;
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    // Restrict
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
 
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+    // Restrict
+    if (impl->skip_rstr_out[field]) continue;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -813,21 +845,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   // Setup
   CeedCallBackend(CeedOperatorSetup_Hip(op));
 
-  // Input Evecs and Restriction
+  // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
   }
 
   // Count number of active input fields
   if (!num_active_in) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedVector  l_vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
@@ -850,12 +882,11 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   // Count number of active output fields
   if (!num_active_out) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
@@ -881,11 +912,6 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
 
-  // Input basis apply
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
-  }
-
   // Assemble QFunction
   for (CeedInt in = 0; in < num_active_in; in++) {
     // Set Inputs
@@ -895,12 +921,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
     }
     // Set Outputs
     for (CeedInt out = 0; out < num_output_fields; out++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
       // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
@@ -912,12 +938,10 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
 
   // Un-set output q-vecs to prevent accidental overwrite of Assembled
   for (CeedInt out = 0; out < num_output_fields; out++) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-    // Check if active output
-    if (vec == CEED_VECTOR_ACTIVE) {
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
   }
@@ -1265,8 +1289,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
   CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes));
   if (num_nodes > 0) {
     // Assemble element operator diagonals
-    CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
     CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem));
+    CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
 
     // Compute the diagonal of B^T D B
     CeedInt elems_per_block = 1;
@@ -1315,6 +1339,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op,
 //------------------------------------------------------------------------------
 static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
+  Ceed_Hip           *hip_data;
   char               *assembly_kernel_source;
   const char         *assembly_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
@@ -1404,7 +1429,8 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
   asmb->block_size_x             = elem_size_in;
   asmb->block_size_y             = elem_size_out;
 
-  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > 1024;
+  CeedCallBackend(CeedGetData(ceed, &hip_data));
+  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > hip_data->device_prop.maxThreadsPerBlock;
 
   if (fallback) {
     // Use fallback kernel with 1D threadblock
@@ -1507,8 +1533,8 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
 // Assemble matrix data for COO matrix of assembled operator.
 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
 //
-// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
-// modes).
+// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator
+// (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
 static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) {
@@ -1619,7 +1645,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, Ceed
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -1640,10 +1666,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   if (impl->has_shared_e_vecs) {
     for (CeedInt i = 0; i < impl->num_outputs; i++) {
       CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
-      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i]));
+      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
     }
-    CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
-                                                num_input_fields, num_output_fields, max_num_points, num_elem));
+    CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                                impl->q_vecs_out, num_output_fields, max_num_points, num_elem));
   }
   impl->has_shared_e_vecs = false;
 
@@ -1657,9 +1683,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
   }
 
-  // Input Evecs and Restriction
+  // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data_in[i], impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data_in[i], impl));
   }
 
   // Clear active input Qvecs
@@ -1671,11 +1698,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
-  // Input basis apply if needed
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data[i], impl));
-  }
-
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode eval_mode;
@@ -1683,8 +1705,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
       // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
     }
   }
 
@@ -1693,12 +1715,12 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     bool                is_active_at_points = true;
     CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
     CeedRestrictionType rstr_type;
-    CeedVector          vec;
+    CeedVector          l_vec;
     CeedElemRestriction elem_rstr;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
     // -- Skip non-active input
-    if (vec != CEED_VECTOR_ACTIVE) continue;
+    if (l_vec != CEED_VECTOR_ACTIVE) continue;
 
     // -- Get active restriction type
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -1721,23 +1743,23 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       if (!is_active_input) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs[i], 0.0));
-      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s - 1, e_vec_size, 0.0));
-      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs[i], s, e_vec_size, 1.0));
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s - 1, e_vec_size, 0.0));
+      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s, e_vec_size, 1.0));
 
       // Basis action
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_in[i]));
           break;
         case CEED_EVAL_INTERP:
         case CEED_EVAL_GRAD:
         case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs[i],
-                                                 impl->q_vecs_in[i]));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
+                                                 impl->e_vecs_in[i], impl->q_vecs_in[i]));
           break;
         case CEED_EVAL_WEIGHT:
           break;  // No action
@@ -1752,13 +1774,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         CeedInt             elem_size        = 0;
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
-        CeedVector          vec;
+        CeedVector          l_vec;
         CeedElemRestriction elem_rstr;
         CeedBasis           basis;
 
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
         // ---- Skip non-active output
-        is_active_output = vec == CEED_VECTOR_ACTIVE;
+        is_active_output = l_vec == CEED_VECTOR_ACTIVE;
         if (!is_active_output) continue;
 
         // ---- Check if elem size matches
@@ -1781,7 +1803,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
         switch (eval_mode) {
           case CEED_EVAL_NONE:
-            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[j + impl->num_inputs], &e_data[j + num_input_fields]));
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &e_data_out[j]));
             break;
           case CEED_EVAL_INTERP:
           case CEED_EVAL_GRAD:
@@ -1789,7 +1811,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
           case CEED_EVAL_CURL:
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
             CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                   impl->q_vecs_out[j], impl->e_vecs[j + impl->num_inputs]));
+                                                   impl->q_vecs_out[j], impl->e_vecs_out[j]));
             break;
           // LCOV_EXCL_START
           case CEED_EVAL_WEIGHT: {
@@ -1799,16 +1821,16 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         }
 
         // Mask output e-vec
-        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs[j + impl->num_inputs], impl->e_vecs[i], impl->e_vecs[j + impl->num_inputs]));
+        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs_out[j], impl->e_vecs_in[i], impl->e_vecs_out[j]));
 
         // Restrict
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[j + impl->num_inputs], assembled, request));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
 
         // Reset q_vec for
         if (eval_mode == CEED_EVAL_NONE) {
-          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[j + impl->num_inputs], CEED_MEM_DEVICE, &e_data[j + num_input_fields]));
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[j + num_input_fields]));
+          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[j], CEED_MEM_DEVICE, &e_data_out[j]));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[j]));
         }
       }
 
@@ -1827,13 +1849,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     // Restore evec
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_data_in[i]));
     }
   }
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, true, &e_data_in[i], impl));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1848,6 +1870,7 @@ int CeedOperatorCreate_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
+
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip));
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 02fb567517..fb2c5b565e 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -137,11 +137,12 @@ typedef struct {
 typedef struct {
   bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
   uint64_t                 *input_states;  // State tracking for passive inputs
-  CeedVector               *e_vecs;        // E-vectors, inputs followed by outputs
-  CeedVector               *q_vecs_in;     // Input Q-vectors needed to apply operator
-  CeedVector               *q_vecs_out;    // Output Q-vectors needed to apply operator
+  CeedVector               *e_vecs_in, *e_vecs_out;
+  CeedVector               *q_vecs_in, *q_vecs_out;
   CeedInt                   num_inputs, num_outputs;
   CeedInt                   num_active_in, num_active_out;
+  CeedInt                  *input_field_order, *output_field_order;
+  CeedSize                  max_active_e_vec_len;
   CeedInt                   max_num_points;
   CeedInt                  *num_points;
   CeedVector               *qf_active_in, point_coords_elem;

From 8bbba8cdca038b4242f0cbb369bb1e31b8a6e289 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 26 Sep 2024 11:27:23 -0600
Subject: [PATCH 179/571] gpu - use cached work vectors across operators

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 382 +++++++++++---------
 backends/cuda-ref/ceed-cuda-ref.h          |   2 +-
 backends/hip-ref/ceed-hip-ref-operator.c   | 390 ++++++++++++---------
 backends/hip-ref/ceed-hip-ref.h            |   2 +-
 interface/ceed-basis.c                     |  10 -
 interface/ceed-vector.c                    |   2 +-
 tests/junit.py                             |   2 +-
 tests/t303-basis.c                         |   6 +-
 8 files changed, 454 insertions(+), 342 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 515cc9a847..a9395c7bb7 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -119,60 +119,48 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    bool         is_strided = false, skip_e_vec = false;
-    CeedSize     q_size;
-    CeedInt      size;
-    CeedEvalMode eval_mode;
-    CeedBasis    basis;
+    bool                is_active = false, is_strided = false, skip_e_vec = false;
+    CeedSize            q_size;
+    CeedInt             size;
+    CeedEvalMode        eval_mode;
+    CeedVector          l_vec;
+    CeedElemRestriction elem_rstr;
 
+    // Check whether this field can skip the element restriction:
+    // Input CEED_VECTOR_ACTIVE
+    // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE
+    // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT
+    // Input passive vectorr with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
+    CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedElemRestriction elem_rstr;
-
-      // Check whether this field can skip the element restriction:
-      // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
-
-      // First, check whether the field is input or output:
-      if (is_input) {
-        CeedVector l_vec;
-
-        // Check for passive input
-        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
-        if (l_vec != CEED_VECTOR_ACTIVE && eval_mode == CEED_EVAL_NONE) {
-          // Check for strided restriction
-          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-          if (is_strided) {
-            // Check if vector is already in preferred backend ordering
-            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
-          }
-        }
-      }
-      if (skip_e_vec) {
-        // Either an active field or strided local vec in backend ordering
-        e_vecs[i] = NULL;
-      } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
-      }
+    skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT);
+    if (!skip_e_vec && is_input && !is_active && eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+      if (is_strided) CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
+    }
+    if (skip_e_vec) {
+      e_vecs[i] = NULL;
+    } else {
+      CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
     }
 
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
-        CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        break;
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
+        q_size = (CeedSize)num_elem * (CeedSize)Q * (CeedSize)size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
-      case CEED_EVAL_WEIGHT:  // Only on input fields
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-        q_size = (CeedSize)num_elem * Q;
+        q_size = (CeedSize)num_elem * (CeedSize)Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         if (is_at_points) {
           CeedInt num_points[num_elem];
@@ -184,6 +172,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
         }
         break;
+      }
     }
   }
   // Drop duplicate restrictions
@@ -201,7 +190,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j] = true;
         }
       }
@@ -220,7 +209,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
@@ -348,41 +337,34 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
 // Restrict Operator Inputs
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                                 CeedVector in_vec, const bool skip_active, CeedScalar **e_data, CeedOperator_Cuda *impl,
+                                                 CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Cuda *impl,
                                                  CeedRequest *request) {
-  CeedEvalMode eval_mode;
-  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
+  bool       is_active = false;
+  CeedVector l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Get input vector
   CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-  if (l_vec == CEED_VECTOR_ACTIVE) {
-    if (skip_active) return CEED_ERROR_SUCCESS;
-    else l_vec = in_vec;
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
   }
 
   // Restriction action
-  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
-  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-  } else {
-    if (!e_vec) {
-      // No restriction for this field; read data directly from vec.
-      CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
-    } else {
-      // Restrict, if necessary
-      if (!impl->skip_rstr_in[input_field]) {
-        uint64_t state;
+  if (e_vec) {
+    // Restrict, if necessary
+    if (!impl->skip_rstr_in[input_field]) {
+      uint64_t state;
 
-        CeedCallBackend(CeedVectorGetState(l_vec, &state));
-        if (state != impl->input_states[input_field] || l_vec == in_vec) {
-          CeedElemRestriction elem_rstr;
+      CeedCallBackend(CeedVectorGetState(l_vec, &state));
+      if (is_active || state != impl->input_states[input_field]) {
+        CeedElemRestriction elem_rstr;
 
-          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
-        }
-        impl->input_states[input_field] = state;
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
       }
-      // Get e-vec
-      CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
+      impl->input_states[input_field] = state;
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -392,24 +374,35 @@ static inline int CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_fiel
 // Input Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                              CeedInt num_elem, const bool skip_active, CeedScalar *e_data, CeedOperator_Cuda *impl) {
+                                              CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const bool skip_active,
+                                              CeedOperator_Cuda *impl) {
+  bool         is_active = false;
   CeedEvalMode eval_mode;
-  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
-  if (skip_active) {
-    CeedVector l_vec;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
-    case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+      } else {
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
+      }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
       break;
+    }
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
     case CEED_EVAL_DIV:
@@ -430,22 +423,30 @@ static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field,
 // Restore Input Vectors
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                                const bool skip_active, CeedScalar **e_data, CeedOperator_Cuda *impl) {
+                                                CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Cuda *impl) {
+  bool         is_active = false;
   CeedEvalMode eval_mode;
   CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Skip active input
   CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-  if (skip_active && l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
 
   // Restore e-vec
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
-  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-  } else {
-    if (!e_vec) {  // This was a skip_restriction case
-      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, (const CeedScalar **)e_data));
+  if (eval_mode == CEED_EVAL_NONE) {
+    const CeedScalar *e_vec_array;
+
+    CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, (CeedScalar **)&e_vec_array));
+    if (e_vec) {
+      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, &e_vec_array));
     } else {
-      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, (const CeedScalar **)e_data));
+      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -456,12 +457,14 @@ static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
+  Ceed                ceed;
+  CeedVector          active_e_vec;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Cuda  *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -472,13 +475,16 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
   // Setup
   CeedCallBackend(CeedOperatorSetup_Cuda(op));
 
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
+
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt field = impl->input_field_order[i];
 
     CeedCallBackend(
-        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
-    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[field], qf_input_fields[field], field, num_elem, false, e_data_in[field], impl));
+        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, false, impl));
   }
 
   // Output pointers, as necessary
@@ -487,9 +493,10 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
@@ -498,12 +505,13 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
   }
 
-  // Output basis apply if needed
+  // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             field = impl->output_field_order[i];
+    bool                is_active = false;
+    CeedInt             field     = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
     CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
@@ -511,7 +519,11 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
 
     // Basis action
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
@@ -531,14 +543,17 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         // LCOV_EXCL_STOP
       }
     }
 
     // Restore evec
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 
     // Restrict
@@ -546,6 +561,9 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
+
+  // Return work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -606,12 +624,14 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
                                                impl->q_vecs_out, num_output_fields, max_num_points, num_elem));
 
   // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
   {
     bool    is_ordered[CEED_FIELD_MAX];
     CeedInt curr_index = 0;
 
     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedSize            e_vec_len_i;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
@@ -622,6 +642,8 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
       for (CeedInt j = i + 1; j < num_input_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
@@ -642,6 +664,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 
     for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
@@ -651,6 +674,8 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
       curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
       for (CeedInt j = i + 1; j < num_output_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
@@ -665,6 +690,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
       }
     }
   }
+
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -673,25 +699,35 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 // Input Basis Action AtPoints
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                                      CeedInt num_elem, const CeedInt *num_points, const bool skip_active, CeedScalar *e_data,
-                                                      CeedOperator_Cuda *impl) {
+                                                      CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt *num_points,
+                                                      const bool skip_active, CeedOperator_Cuda *impl) {
+  bool         is_active = false;
   CeedEvalMode eval_mode;
-  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
-  if (skip_active) {
-    CeedVector l_vec;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
-    case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+      } else {
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
+      }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
       break;
+    }
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
     case CEED_EVAL_DIV:
@@ -713,12 +749,14 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
   CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
+  Ceed                ceed;
+  CeedVector          active_e_vec;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Cuda  *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -730,6 +768,9 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
 
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
+
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -745,9 +786,9 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     CeedInt field = impl->input_field_order[i];
 
     CeedCallBackend(
-        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, num_elem, num_points, false,
-                                                        e_data_in[field], impl));
+        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem,
+                                                        num_points, false, impl));
   }
 
   // Output pointers, as necessary
@@ -756,9 +797,10 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
@@ -767,12 +809,13 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
   }
 
-  // Output basis apply if needed
+  // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             field = impl->output_field_order[i];
+    bool                is_active = false;
+    CeedInt             field     = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
     CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
@@ -780,7 +823,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
 
     // Basis action
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
@@ -800,14 +847,17 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         // LCOV_EXCL_STOP
       }
     }
 
     // Restore evec
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 
     // Restrict
@@ -815,6 +865,9 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
+
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -825,7 +878,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
                                                                CeedRequest *request) {
   Ceed                ceed, ceed_parent;
   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
-  CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedScalar         *assembled_array;
   CeedVector         *active_inputs;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -848,8 +901,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, true, impl));
   }
 
   // Count number of active input fields
@@ -949,7 +1002,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
   }
 
   // Restore output
@@ -1646,12 +1699,14 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
+  Ceed                ceed;
+  CeedVector          active_e_vec_in, active_e_vec_out;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Cuda  *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -1663,16 +1718,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
 
-  // Create separate output e-vecs
-  if (impl->has_shared_e_vecs) {
-    for (CeedInt i = 0; i < impl->num_outputs; i++) {
-      CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
-      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
-    }
-    CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
-                                                 impl->q_vecs_out, num_output_fields, max_num_points, num_elem));
-  }
-  impl->has_shared_e_vecs = false;
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in));
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out));
 
   // Get point coordinates
   if (!impl->point_coords_elem) {
@@ -1686,8 +1734,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data_in[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, impl));
   }
 
   // Clear active input Qvecs
@@ -1705,9 +1753,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
@@ -1735,32 +1784,36 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     for (CeedInt s = 0; s < e_vec_size; s++) {
       bool         is_active_input = false;
       CeedEvalMode eval_mode;
-      CeedVector   vec;
+      CeedVector   l_vec, q_vec = impl->q_vecs_in[i];
       CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
       // Skip non-active input
-      is_active_input = vec == CEED_VECTOR_ACTIVE;
+      is_active_input = l_vec == CEED_VECTOR_ACTIVE;
       if (!is_active_input) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
-      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s - 1, e_vec_size, 0.0));
-      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s, e_vec_size, 1.0));
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
+      else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s - 1, e_vec_size, 0.0));
+      CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s, e_vec_size, 1.0));
 
       // Basis action
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_in[i]));
+        case CEED_EVAL_NONE: {
+          const CeedScalar *e_vec_array;
+
+          CeedCallBackend(CeedVectorGetArrayRead(active_e_vec_in, CEED_MEM_DEVICE, &e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
           break;
+        }
         case CEED_EVAL_INTERP:
         case CEED_EVAL_GRAD:
         case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                 impl->e_vecs_in[i], impl->q_vecs_in[i]));
+          CeedCallBackend(
+              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, active_e_vec_in, q_vec));
           break;
         case CEED_EVAL_WEIGHT:
           break;  // No action
@@ -1775,7 +1828,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         CeedInt             elem_size        = 0;
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
-        CeedVector          l_vec;
+        CeedVector          l_vec, e_vec = impl->e_vecs_out[j], q_vec = impl->q_vecs_out[j];
         CeedElemRestriction elem_rstr;
         CeedBasis           basis;
 
@@ -1783,6 +1836,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         // ---- Skip non-active output
         is_active_output = l_vec == CEED_VECTOR_ACTIVE;
         if (!is_active_output) continue;
+        if (!e_vec) e_vec = active_e_vec_out;
 
         // ---- Check if elem size matches
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
@@ -1803,16 +1857,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         // Basis action
         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
         switch (eval_mode) {
-          case CEED_EVAL_NONE:
-            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &e_data_out[j]));
+          case CEED_EVAL_NONE: {
+            CeedScalar *e_vec_array;
+
+            CeedCallBackend(CeedVectorTakeArray(q_vec, CEED_MEM_DEVICE, &e_vec_array));
+            CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
             break;
+          }
           case CEED_EVAL_INTERP:
           case CEED_EVAL_GRAD:
           case CEED_EVAL_DIV:
           case CEED_EVAL_CURL:
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                   impl->q_vecs_out[j], impl->e_vecs_out[j]));
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
             break;
           // LCOV_EXCL_START
           case CEED_EVAL_WEIGHT: {
@@ -1822,21 +1879,23 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         }
 
         // Mask output e-vec
-        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs_out[j], impl->e_vecs_in[i], impl->e_vecs_out[j]));
+        CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
 
         // Restrict
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request));
 
         // Reset q_vec for
         if (eval_mode == CEED_EVAL_NONE) {
-          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[j], CEED_MEM_DEVICE, &e_data_out[j]));
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[j]));
+          CeedScalar *e_vec_array;
+
+          CeedCallBackend(CeedVectorGetArrayWrite(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
         }
       }
 
       // Reset vec
-      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(q_vec, 0.0));
     }
   }
 
@@ -1850,13 +1909,16 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     // Restore evec
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_data_in[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_vec_array));
     }
   }
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, true, &e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
   }
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index b6e7016013..9e167463bd 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -131,7 +131,7 @@ typedef struct {
 } CeedOperatorAssemble_Cuda;
 
 typedef struct {
-  bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
+  bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
   uint64_t                  *input_states;  // State tracking for passive inputs
   CeedVector                *e_vecs_in, *e_vecs_out;
   CeedVector                *q_vecs_in, *q_vecs_out;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 4534e840ee..7f661d2aa3 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -118,60 +118,48 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    bool         is_strided = false, skip_e_vec = false;
-    CeedSize     q_size;
-    CeedInt      size;
-    CeedEvalMode eval_mode;
-    CeedBasis    basis;
+    bool                is_active = false, is_strided = false, skip_e_vec = false;
+    CeedSize            q_size;
+    CeedInt             size;
+    CeedEvalMode        eval_mode;
+    CeedVector          l_vec;
+    CeedElemRestriction elem_rstr;
 
+    // Check whether this field can skip the element restriction:
+    // Input CEED_VECTOR_ACTIVE
+    // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE
+    // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT
+    // Input passive vectorr with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
+    CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedElemRestriction elem_rstr;
-
-      // Check whether this field can skip the element restriction:
-      // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
-
-      // First, check whether the field is input or output:
-      if (is_input) {
-        CeedVector l_vec;
-
-        // Check for passive input
-        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
-        if (l_vec != CEED_VECTOR_ACTIVE && eval_mode == CEED_EVAL_NONE) {
-          // Check for strided restriction
-          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-          if (is_strided) {
-            // Check if vector is already in preferred backend ordering
-            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
-          }
-        }
-      }
-      if (skip_e_vec) {
-        // Either an active field or strided local vec in backend ordering
-        e_vecs[i] = NULL;
-      } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
-      }
+    skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT);
+    if (!skip_e_vec && is_input && !is_active && eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+      if (is_strided) CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
+    }
+    if (skip_e_vec) {
+      e_vecs[i] = NULL;
+    } else {
+      CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
     }
 
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
-        CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        break;
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
+        q_size = (CeedSize)num_elem * (CeedSize)Q * (CeedSize)size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
-      case CEED_EVAL_WEIGHT:  // Only on input fields
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-        q_size = (CeedSize)num_elem * Q;
+        q_size = (CeedSize)num_elem * (CeedSize)Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         if (is_at_points) {
           CeedInt num_points[num_elem];
@@ -183,6 +171,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
         }
         break;
+      }
     }
   }
   // Drop duplicate restrictions
@@ -200,7 +189,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j] = true;
         }
       }
@@ -219,7 +208,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
         if (vec_i == vec_j && rstr_i == rstr_j) {
-          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
@@ -347,41 +336,34 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
 // Restrict Operator Inputs
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                                CeedVector in_vec, const bool skip_active, CeedScalar **e_data, CeedOperator_Hip *impl,
+                                                CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Hip *impl,
                                                 CeedRequest *request) {
-  CeedEvalMode eval_mode;
-  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
+  bool       is_active = false;
+  CeedVector l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Get input vector
   CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-  if (l_vec == CEED_VECTOR_ACTIVE) {
-    if (skip_active) return CEED_ERROR_SUCCESS;
-    else l_vec = in_vec;
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
   }
 
   // Restriction action
-  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
-  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-  } else {
-    if (!e_vec) {
-      // No restriction for this field; read data directly from vec.
-      CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
-    } else {
-      // Restrict, if necessary
-      if (!impl->skip_rstr_in[input_field]) {
-        uint64_t state;
+  if (e_vec) {
+    // Restrict, if necessary
+    if (!impl->skip_rstr_in[input_field]) {
+      uint64_t state;
 
-        CeedCallBackend(CeedVectorGetState(l_vec, &state));
-        if (state != impl->input_states[input_field] || l_vec == in_vec) {
-          CeedElemRestriction elem_rstr;
+      CeedCallBackend(CeedVectorGetState(l_vec, &state));
+      if (is_active || state != impl->input_states[input_field]) {
+        CeedElemRestriction elem_rstr;
 
-          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
-        }
-        impl->input_states[input_field] = state;
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
       }
-      // Get e-vec
-      CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, (const CeedScalar **)e_data));
+      impl->input_states[input_field] = state;
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -391,24 +373,35 @@ static inline int CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field
 // Input Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                             CeedInt num_elem, const bool skip_active, CeedScalar *e_data, CeedOperator_Hip *impl) {
+                                             CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const bool skip_active,
+                                             CeedOperator_Hip *impl) {
+  bool         is_active = false;
   CeedEvalMode eval_mode;
-  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
-  if (skip_active) {
-    CeedVector l_vec;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
-    case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+      } else {
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
+      }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
       break;
+    }
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
     case CEED_EVAL_DIV:
@@ -429,22 +422,30 @@ static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, C
 // Restore Input Vectors
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                               const bool skip_active, CeedScalar **e_data, CeedOperator_Hip *impl) {
+                                               CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Hip *impl) {
+  bool         is_active = false;
   CeedEvalMode eval_mode;
   CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
 
   // Skip active input
   CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-  if (skip_active && l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
 
   // Restore e-vec
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
-  if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-  } else {
-    if (!e_vec) {  // This was a skip_restriction case
-      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, (const CeedScalar **)e_data));
+  if (eval_mode == CEED_EVAL_NONE) {
+    const CeedScalar *e_vec_array;
+
+    CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, (CeedScalar **)&e_vec_array));
+    if (e_vec) {
+      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, &e_vec_array));
     } else {
-      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, (const CeedScalar **)e_data));
+      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -455,12 +456,14 @@ static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field,
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
+  Ceed                ceed;
+  CeedVector          active_e_vec;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Hip   *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -471,13 +474,15 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
   // Setup
   CeedCallBackend(CeedOperatorSetup_Hip(op));
 
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
+
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt field = impl->input_field_order[i];
 
-    CeedCallBackend(
-        CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
-    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[field], qf_input_fields[field], field, num_elem, false, e_data_in[field], impl));
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, false, impl));
   }
 
   // Output pointers, as necessary
@@ -486,9 +491,10 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
@@ -497,12 +503,13 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
   }
 
-  // Output basis apply if needed
+  // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             field = impl->output_field_order[i];
+    bool                is_active = false;
+    CeedInt             field     = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
     CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
@@ -510,7 +517,11 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
 
     // Basis action
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
@@ -530,14 +541,17 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         // LCOV_EXCL_STOP
       }
     }
 
     // Restore evec
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 
     // Restrict
@@ -545,6 +559,9 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
+
+  // Return work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -605,12 +622,14 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
                                               num_output_fields, max_num_points, num_elem));
 
   // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
   {
     bool    is_ordered[CEED_FIELD_MAX];
     CeedInt curr_index = 0;
 
     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedSize            e_vec_len_i;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
@@ -621,6 +640,8 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
       for (CeedInt j = i + 1; j < num_input_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
@@ -641,6 +662,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
 
     for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
       CeedVector          vec_i;
       CeedElemRestriction rstr_i;
 
@@ -650,6 +672,8 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
       curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
       for (CeedInt j = i + 1; j < num_output_fields; j++) {
         CeedVector          vec_j;
         CeedElemRestriction rstr_j;
@@ -664,6 +688,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
       }
     }
   }
+
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -672,25 +697,35 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
 // Input Basis Action AtPoints
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
-                                                     CeedInt num_elem, const CeedInt *num_points, const bool skip_active, CeedScalar *e_data,
-                                                     CeedOperator_Hip *impl) {
+                                                     CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt *num_points,
+                                                     const bool skip_active, CeedOperator_Hip *impl) {
+  bool         is_active = false;
   CeedEvalMode eval_mode;
-  CeedVector   e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
 
   // Skip active input
-  if (skip_active) {
-    CeedVector l_vec;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) return CEED_ERROR_SUCCESS;
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
   }
 
   // Basis action
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
   switch (eval_mode) {
-    case CEED_EVAL_NONE:
-      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_data));
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+      } else {
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
+      }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
       break;
+    }
     case CEED_EVAL_INTERP:
     case CEED_EVAL_GRAD:
     case CEED_EVAL_DIV:
@@ -712,12 +747,14 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
   CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
+  Ceed                ceed;
+  CeedVector          active_e_vec;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Hip   *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -729,6 +766,9 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
 
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
+
   // Get point coordinates
   if (!impl->point_coords_elem) {
     CeedVector          point_coords = NULL;
@@ -743,10 +783,9 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt field = impl->input_field_order[i];
 
-    CeedCallBackend(
-        CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, false, &e_data_in[field], impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, num_elem, num_points, false,
-                                                       e_data_in[field], impl));
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem,
+                                                       num_points, false, impl));
   }
 
   // Output pointers, as necessary
@@ -755,9 +794,10 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
@@ -766,12 +806,13 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, false, &e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
   }
 
-  // Output basis apply if needed
+  // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             field = impl->output_field_order[i];
+    bool                is_active = false;
+    CeedInt             field     = impl->output_field_order[i];
     CeedEvalMode        eval_mode;
     CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
     CeedElemRestriction elem_rstr;
@@ -779,7 +820,11 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
-    if (l_vec == CEED_VECTOR_ACTIVE) l_vec = out_vec;
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
 
     // Basis action
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
@@ -799,14 +844,17 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         // LCOV_EXCL_STOP
       }
     }
 
     // Restore evec
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_data_out[field]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 
     // Restrict
@@ -814,6 +862,9 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
   }
+
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -824,7 +875,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
                                                               CeedRequest *request) {
   Ceed                ceed, ceed_parent;
   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
-  CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedScalar         *assembled_array;
   CeedVector         *active_inputs;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -847,8 +898,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, true, e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, true, impl));
   }
 
   // Count number of active input fields
@@ -948,7 +999,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, true, &e_data[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
   }
 
   // Restore output
@@ -1339,7 +1390,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op,
 //------------------------------------------------------------------------------
 static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
-  Ceed_Hip           *hip_data;
+  Ceed_Hip           *Hip_data;
   char               *assembly_kernel_source;
   const char         *assembly_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
@@ -1429,8 +1480,8 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
   asmb->block_size_x             = elem_size_in;
   asmb->block_size_y             = elem_size_out;
 
-  CeedCallBackend(CeedGetData(ceed, &hip_data));
-  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > hip_data->device_prop.maxThreadsPerBlock;
+  CeedCallBackend(CeedGetData(ceed, &Hip_data));
+  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > Hip_data->device_prop.maxThreadsPerBlock;
 
   if (fallback) {
     // Use fallback kernel with 1D threadblock
@@ -1645,12 +1696,14 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, Ceed
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
-  CeedScalar         *e_data_in[CEED_FIELD_MAX] = {NULL}, *e_data_out[CEED_FIELD_MAX] = {NULL};
+  Ceed                ceed;
+  CeedVector          active_e_vec_in, active_e_vec_out;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Hip   *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -1662,16 +1715,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
 
-  // Create separate output e-vecs
-  if (impl->has_shared_e_vecs) {
-    for (CeedInt i = 0; i < impl->num_outputs; i++) {
-      CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
-      CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
-    }
-    CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
-                                                impl->q_vecs_out, num_output_fields, max_num_points, num_elem));
-  }
-  impl->has_shared_e_vecs = false;
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in));
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out));
 
   // Get point coordinates
   if (!impl->point_coords_elem) {
@@ -1685,8 +1731,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, true, &e_data_in[i], impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, num_elem, num_points, true, e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, impl));
   }
 
   // Clear active input Qvecs
@@ -1704,9 +1750,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_data_out[i]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
@@ -1734,32 +1781,36 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     for (CeedInt s = 0; s < e_vec_size; s++) {
       bool         is_active_input = false;
       CeedEvalMode eval_mode;
-      CeedVector   vec;
+      CeedVector   l_vec, q_vec = impl->q_vecs_in[i];
       CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
       // Skip non-active input
-      is_active_input = vec == CEED_VECTOR_ACTIVE;
+      is_active_input = l_vec == CEED_VECTOR_ACTIVE;
       if (!is_active_input) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
-      else CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s - 1, e_vec_size, 0.0));
-      CeedCallBackend(CeedVectorSetValueStrided(impl->e_vecs_in[i], s, e_vec_size, 1.0));
+      if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
+      else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s - 1, e_vec_size, 0.0));
+      CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s, e_vec_size, 1.0));
 
       // Basis action
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_in[i]));
+        case CEED_EVAL_NONE: {
+          const CeedScalar *e_vec_array;
+
+          CeedCallBackend(CeedVectorGetArrayRead(active_e_vec_in, CEED_MEM_DEVICE, &e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
           break;
+        }
         case CEED_EVAL_INTERP:
         case CEED_EVAL_GRAD:
         case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                 impl->e_vecs_in[i], impl->q_vecs_in[i]));
+          CeedCallBackend(
+              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, active_e_vec_in, q_vec));
           break;
         case CEED_EVAL_WEIGHT:
           break;  // No action
@@ -1774,7 +1825,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         CeedInt             elem_size        = 0;
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
-        CeedVector          l_vec;
+        CeedVector          l_vec, e_vec = impl->e_vecs_out[j], q_vec = impl->q_vecs_out[j];
         CeedElemRestriction elem_rstr;
         CeedBasis           basis;
 
@@ -1782,6 +1833,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         // ---- Skip non-active output
         is_active_output = l_vec == CEED_VECTOR_ACTIVE;
         if (!is_active_output) continue;
+        if (!e_vec) e_vec = active_e_vec_out;
 
         // ---- Check if elem size matches
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
@@ -1802,16 +1854,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         // Basis action
         CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
         switch (eval_mode) {
-          case CEED_EVAL_NONE:
-            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &e_data_out[j]));
+          case CEED_EVAL_NONE: {
+            CeedScalar *e_vec_array;
+
+            CeedCallBackend(CeedVectorTakeArray(q_vec, CEED_MEM_DEVICE, &e_vec_array));
+            CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
             break;
+          }
           case CEED_EVAL_INTERP:
           case CEED_EVAL_GRAD:
           case CEED_EVAL_DIV:
           case CEED_EVAL_CURL:
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem,
-                                                   impl->q_vecs_out[j], impl->e_vecs_out[j]));
+            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
             break;
           // LCOV_EXCL_START
           case CEED_EVAL_WEIGHT: {
@@ -1821,21 +1876,23 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         }
 
         // Mask output e-vec
-        CeedCallBackend(CeedVectorPointwiseMult(impl->e_vecs_out[j], impl->e_vecs_in[i], impl->e_vecs_out[j]));
+        CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
 
         // Restrict
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request));
 
         // Reset q_vec for
         if (eval_mode == CEED_EVAL_NONE) {
-          CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[j], CEED_MEM_DEVICE, &e_data_out[j]));
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[j], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data_out[j]));
+          CeedScalar *e_vec_array;
+
+          CeedCallBackend(CeedVectorGetArrayWrite(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
         }
       }
 
       // Reset vec
-      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(q_vec, 0.0));
     }
   }
 
@@ -1849,13 +1906,16 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     // Restore evec
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_data_in[i]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_vec_array));
     }
   }
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, true, &e_data_in[i], impl));
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
   }
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index fb2c5b565e..52e88129a1 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -135,7 +135,7 @@ typedef struct {
 } CeedOperatorAssemble_Hip;
 
 typedef struct {
-  bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
+  bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
   uint64_t                 *input_states;  // State tracking for passive inputs
   CeedVector               *e_vecs_in, *e_vecs_out;
   CeedVector               *q_vecs_in, *q_vecs_out;
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 67b9d43345..aa19489e0a 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -331,11 +331,6 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
   if (x_ref != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(x_ref, &x_length));
   if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length));
 
-  // Check compatibility of topological and geometrical dimensions
-  CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0) ||
-                (eval_mode == CEED_EVAL_WEIGHT),
-            ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions and number of points");
-
   // Check compatibility coordinates vector
   for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
   CeedCheck((x_length >= total_num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
@@ -1819,11 +1814,6 @@ static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransp
   CeedCall(CeedVectorGetLength(v, &v_length));
   if (u) CeedCall(CeedVectorGetLength(u, &u_length));
 
-  // Check compatibility of topological and geometrical dimensions
-  CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0 && u_length % num_qpts == 0) ||
-                (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0 && v_length % num_qpts == 0),
-            ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions");
-
   // Check vector lengths to prevent out of bounds issues
   bool has_good_dims = true;
   switch (eval_mode) {
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 7c10ca98bc..99672bce09 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -862,7 +862,7 @@ int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y) {
   CeedCall(CeedVectorGetLength(w, &length_w));
   CeedCall(CeedVectorGetLength(x, &length_x));
   CeedCall(CeedVectorGetLength(y, &length_y));
-  CeedCheck(length_w == length_x && length_w == length_y, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(length_x >= length_x && length_y >= length_w, ceed, CEED_ERROR_UNSUPPORTED,
             "Cannot multiply vectors of different lengths."
             " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
             length_x, length_y);
diff --git a/tests/junit.py b/tests/junit.py
index 923dbee50c..a20360d91f 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -172,7 +172,7 @@ def check_required_failure(self, test: str, spec: TestSpec, resource: str, stder
         elif test_id in ['t215']:
             fail_str = 'Cannot destroy CeedElemRestriction, a process has read access to the offset data'
         elif test_id in ['t303']:
-            fail_str = 'Length of input/output vectors incompatible with basis dimensions'
+            fail_str = 'Input/output vectors too short for basis and evaluation mode'
         elif test_id in ['t408']:
             fail_str = 'CeedQFunctionContextGetData(): Cannot grant CeedQFunctionContext data access, a process has read access'
         elif test_id in ['t409'] and contains_any(resource, ['memcheck']):
diff --git a/tests/t303-basis.c b/tests/t303-basis.c
index d71c97a6e7..baf844bf50 100644
--- a/tests/t303-basis.c
+++ b/tests/t303-basis.c
@@ -1,6 +1,6 @@
 /// @file
-/// Test checking BasisApply input/output vectors compatibility with basis dimensions
-/// \test Test checking BasisApply input/output vectors compatibility with basis dimensions
+/// Test checking BasisApply input/output vectors compatibility with basis
+/// \test Test checking BasisApply input/output vectors compatibility with basis
 
 //TESTARGS(only="cpu") {ceed_resource}
 #include <ceed.h>
@@ -15,7 +15,7 @@ int main(int argc, char **argv) {
   CeedInit(argv[1], &ceed);
 
   CeedVectorCreate(ceed, len, &u);
-  CeedVectorCreate(ceed, len + 1, &v);
+  CeedVectorCreate(ceed, len - 1, &v);
 
   CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis);
 

From 19a04db8f2245ce5b2896f00805cba58aafa343d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 26 Sep 2024 14:40:14 -0600
Subject: [PATCH 180/571] gpu - only overwite portion of basis target used

---
 backends/cuda-ref/ceed-cuda-ref-basis.c       | 19 +++++++++++---
 backends/cuda-shared/ceed-cuda-shared-basis.c |  7 ++++-
 backends/hip-ref/ceed-hip-ref-basis.c         | 14 ++++++++--
 backends/hip-shared/ceed-hip-shared-basis.c   |  7 ++++-
 interface/ceed-basis.c                        | 26 +++++++++++--------
 5 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index c245a51489..1c38ce002c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -40,9 +40,14 @@ static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedIn
 
   // Clear v for transpose operation
   if (is_transpose && !apply_add) {
+    CeedInt  num_comp, q_comp, num_nodes, num_qpts;
     CeedSize length;
 
-    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
+    CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
+    length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp));
     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
   }
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
@@ -206,9 +211,14 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
 
   // Clear v for transpose operation
   if (is_transpose && !apply_add) {
+    CeedInt  num_comp, q_comp, num_nodes;
     CeedSize length;
 
-    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
+    length =
+        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
   }
 
@@ -283,9 +293,12 @@ static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, con
 
   // Clear v for transpose operation
   if (is_transpose && !apply_add) {
+    CeedInt  num_comp, q_comp;
     CeedSize length;
 
-    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp));
     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
   }
 
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 8924ff52f4..c947846f3a 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -312,9 +312,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
 
   // Clear v for transpose operation
   if (is_transpose && !apply_add) {
+    CeedInt  num_comp, q_comp, num_nodes;
     CeedSize length;
 
-    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
+    length =
+        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
   }
 
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 70dda0a7da..f54184f28d 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -39,9 +39,14 @@ static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt
 
   // Clear v for transpose operation
   if (is_transpose && !apply_add) {
+    CeedInt  num_comp, q_comp, num_nodes, num_qpts;
     CeedSize length;
 
-    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
+    CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
+    length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp));
     CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
   }
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
@@ -204,9 +209,14 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
 
   // Clear v for transpose operation
   if (is_transpose && !apply_add) {
+    CeedInt  num_comp, q_comp, num_nodes;
     CeedSize length;
 
-    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
+    length =
+        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
     CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
   }
 
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 307107ec6b..05b564e7f2 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -371,9 +371,14 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
 
   // Clear v for transpose operation
   if (is_transpose && !apply_add) {
+    CeedInt  num_comp, q_comp, num_nodes;
     CeedSize length;
 
-    CeedCallBackend(CeedVectorGetLength(v, &length));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
+    length =
+        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
     CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
   }
 
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index aa19489e0a..c6869f2f3b 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -333,10 +333,10 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
 
   // Check compatibility coordinates vector
   for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
-  CeedCheck((x_length >= total_num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
+  CeedCheck((x_length >= (CeedSize)total_num_points * (CeedSize)dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
             "Length of reference coordinate vector incompatible with basis dimension and number of points."
             " Found reference coordinate vector of length %" CeedSize_FMT ", not of length %" CeedSize_FMT ".",
-            x_length, total_num_points * dim);
+            x_length, (CeedSize)total_num_points * (CeedSize)dim);
 
   // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
   CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_UNSUPPORTED,
@@ -346,13 +346,16 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
   bool has_good_dims = true;
   switch (eval_mode) {
     case CEED_EVAL_INTERP:
-      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp || v_length >= num_elem * num_nodes * num_comp)) ||
-                       (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp || u_length >= num_elem * num_nodes * num_comp)));
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp ||
+                                                     v_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)) ||
+                       (t_mode == CEED_NOTRANSPOSE && (v_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp ||
+                                                       u_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)));
       break;
     case CEED_EVAL_GRAD:
-      has_good_dims =
-          ((t_mode == CEED_TRANSPOSE && (u_length >= total_num_points * num_q_comp * dim || v_length >= num_elem * num_nodes * num_comp)) ||
-           (t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points * num_q_comp * dim || u_length >= num_elem * num_nodes * num_comp)));
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp * (CeedSize)dim ||
+                                                     v_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)) ||
+                       (t_mode == CEED_NOTRANSPOSE && (v_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp * (CeedSize)dim ||
+                                                       u_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)));
       break;
     case CEED_EVAL_WEIGHT:
       has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points);
@@ -1822,12 +1825,13 @@ static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransp
     case CEED_EVAL_GRAD:
     case CEED_EVAL_DIV:
     case CEED_EVAL_CURL:
-      has_good_dims =
-          ((t_mode == CEED_TRANSPOSE && u_length >= num_elem * num_comp * num_qpts * q_comp && v_length >= num_elem * num_comp * num_nodes) ||
-           (t_mode == CEED_NOTRANSPOSE && v_length >= num_elem * num_qpts * num_comp * q_comp && u_length >= num_elem * num_comp * num_nodes));
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_qpts * (CeedSize)q_comp &&
+                        v_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes) ||
+                       (t_mode == CEED_NOTRANSPOSE && v_length >= (CeedSize)num_elem * (CeedSize)num_qpts * (CeedSize)num_comp * (CeedSize)q_comp &&
+                        u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes));
       break;
     case CEED_EVAL_WEIGHT:
-      has_good_dims = v_length >= num_elem * num_qpts;
+      has_good_dims = v_length >= (CeedSize)num_elem * (CeedSize)num_qpts;
       break;
   }
   CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");

From 96093a692cd0c4b43a8682facf0e9dc530ddd7cf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 26 Sep 2024 16:47:56 -0600
Subject: [PATCH 181/571] minor - bump max it clip on bps slightly

---
 examples/petsc/bps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index 42e05a1416..9040add679 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -25,7 +25,7 @@
 //     ./bps -problem bp6 -degree 3 -ceed /gpu/cuda
 //
 //TESTARGS(name="BP3, tet elements") -ceed {ceed_resource} -test -problem bp3 -degree 3 -ksp_max_it_clip 50,50 -simplex
-//TESTARGS(name="BP5, hex elements") -ceed {ceed_resource} -test -problem bp5 -degree 3 -ksp_max_it_clip 15,15
+//TESTARGS(name="BP5, hex elements") -ceed {ceed_resource} -test -problem bp5 -degree 3 -ksp_max_it_clip 18,18
 
 /// @file
 /// CEED BPs example using PETSc with DMPlex

From ccad5cb9bf047bf371d46bed280e0f2085918088 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 3 Oct 2024 11:11:49 -0600
Subject: [PATCH 182/571] vec - fix poinwisemult length check

---
 interface/ceed-vector.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 99672bce09..b0e277e9d2 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -862,10 +862,10 @@ int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y) {
   CeedCall(CeedVectorGetLength(w, &length_w));
   CeedCall(CeedVectorGetLength(x, &length_x));
   CeedCall(CeedVectorGetLength(y, &length_y));
-  CeedCheck(length_x >= length_x && length_y >= length_w, ceed, CEED_ERROR_UNSUPPORTED,
-            "Cannot multiply vectors of different lengths."
-            " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
-            length_x, length_y);
+  CeedCheck(length_x >= length_w && length_y >= length_w, ceed, CEED_ERROR_UNSUPPORTED,
+            "Cannot pointwise multiply vectors of incompatible lengths."
+            " w length: %" CeedSize_FMT " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
+            length_w, length_x, length_y);
 
   CeedCall(CeedGetParent(w->ceed, &ceed_parent_w));
   CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));

From 54404f0b18b124b59c791cebcf070e3ee7677352 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 3 Oct 2024 11:37:05 -0600
Subject: [PATCH 183/571] gpu - fix atpoints evec for diag incompatibility

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 15 ++++++++++++++-
 backends/hip-ref/ceed-hip-ref-operator.c   | 15 ++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index a9395c7bb7..a3283f544c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1721,6 +1721,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   // Work vector
   CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in));
   CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out));
+  {
+    CeedSize length_in, length_out;
+
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_in, &length_in));
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_out, &length_out));
+    // Need input e_vec to be longer
+    if (length_in < length_out) {
+      CeedVector temp = active_e_vec_in;
+
+      active_e_vec_in  = active_e_vec_out;
+      active_e_vec_out = temp;
+    }
+  }
 
   // Get point coordinates
   if (!impl->point_coords_elem) {
@@ -1804,7 +1817,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
           const CeedScalar *e_vec_array;
 
           CeedCallBackend(CeedVectorGetArrayRead(active_e_vec_in, CEED_MEM_DEVICE, &e_vec_array));
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
           break;
         }
         case CEED_EVAL_INTERP:
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 7f661d2aa3..95e0589bd1 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1718,6 +1718,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   // Work vector
   CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in));
   CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out));
+  {
+    CeedSize length_in, length_out;
+
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_in, &length_in));
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_out, &length_out));
+    // Need input e_vec to be longer
+    if (length_in < length_out) {
+      CeedVector temp = active_e_vec_in;
+
+      active_e_vec_in  = active_e_vec_out;
+      active_e_vec_out = temp;
+    }
+  }
 
   // Get point coordinates
   if (!impl->point_coords_elem) {
@@ -1801,7 +1814,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
           const CeedScalar *e_vec_array;
 
           CeedCallBackend(CeedVectorGetArrayRead(active_e_vec_in, CEED_MEM_DEVICE, &e_vec_array));
-          CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
           break;
         }
         case CEED_EVAL_INTERP:

From 70952158361b9d75c803a361a1c499ca025d805a Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 4 Oct 2024 10:40:38 -0600
Subject: [PATCH 184/571] fix(fluids): Update tests for fixed ts_type alpha

The startup logic for `ts_type: alpha` was fixed in petsc/petsc!7816, so
the reference solutions needed to be fixed as well.
---
 examples/fluids/problems/newtonian.c          |   2 +-
 .../fluids-navierstokes-gaussianwave-IDL.bin  | Bin 2340 -> 2340 bytes
 ...fluids-navierstokes-gaussianwave-shell.bin | Bin 7092 -> 7092 bytes
 3 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index bc673e00dd..b7349474e0 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -129,7 +129,7 @@ static PetscErrorCode UnitTests_Newtonian(User user, NewtonianIdealGasContext ga
                                  -rho_div_p};
 
   {
-    CeedScalar rtol = 20 * CEED_EPSILON;
+    CeedScalar rtol = 40 * CEED_EPSILON;
 
     PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_CONSERVATIVE, gas, Y0, rtol, rtol, rtol));
     PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_ENTROPY, gas, Y0, rtol, rtol, rtol));
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin
index a7133e5ad2fe98d923b67079a96fa4b7c3cb94e0..9ae86474552f7e1087d707dde64416b802b8aa33 100644
GIT binary patch
literal 2340
zcmXBVc|6ql9|!O+C9NEVQK2PtEk$yK^!cPxzsT%&GUckFgD#bdSaMB-h;nq;)THeo
zU8XX&RW{qPY%>)<6j_nlBGy%l=DY8A`~7de-^YC3ulMJ8el$O}Y-Ij>)X0G!T9;kX
zkN)~oH`?pJ*D9uzJ^6j@yUujV#B4K3`+S2kms9U;-76pp{fKXOowcWIMf#K5WD((1
z3hQIaTPd?Sq139jlqg2)45a0jQh{+$?p6D@gzE^n<ZUXZOo+Tire{YKW4_btJTIhd
z-!=7nPFpEsGAr3vUqqSlt>v~eeTYIOoOi78F=cz|vaUtECtUBL#JH#RgyZZ)gAo#<
zV0a5A6kn!nx3<%^6^4{iRkk?$tRkG!>1Ct1kI1=GM~36)P&UIdGb3d;WeP4nbJ00N
zIN8Izn3Wbpp))CPTK+@I7P?*v-<m);Gt=rd3xf#v)`lJF4S@g8$~(8zow9f4*53YQ
z4dI$D?oE2&K$H;^rF5|mk+-iq7`$FY*|$579Es{D-2YV)noZ9VPBAQ;V{J&}N*DhH
z@ed%sky?AlE65!kec7lO@^!V?xo5%nG3<58B>3(xC^>zCGE93*QgSBrVN~;HmnQUQ
zJ9V^_PuVOsuU%qKnezRsQWm|UjGlb4{qs?fKRUq0+Jdqd_En9P4-sWZBUzEJO&JZ}
z!nAUz&$U{PdDcJ$-cS7Z`5O}MVUHr)GLWharn7pMDny|*DtTF&BV~sMT)OW?Qifk6
z{@%WtGR|LQIrG*Lg{oyr_Vdq_?G@#0i}WE}8?*d`MHJ*SsQq=Y3i8Pw9G_!N*{*P}
zo3-JT8MDy6;8-5v`j;_+osrOgj)wV%J(SHnJTh84gEHsenf~*|jBrU!zs&ISCJJq_
z$L6s!Vg8r(Rs~KYoW@XNTXGHIo@@`A9wvhEo4snq7F57aRQ62kCS1L+T3W45{$Bp`
zhc!+HL@_=4Rnh7Rl<f}@I(&LSxZ#u9iRN8I8K7Mu{SZtPYQ36wKTd~ygDX~A%!mE+
zTY&+MhJ3sjXC{uwM_#t<_k#X=4$Tjn*A9Jk@CG+1DWh+G^ZCssP(O!PCVE8K%BF~)
zJj>}Q+fhPMa1+dLg=LnPj3{RN<_8p=q-@>J_)e1{@^QCf<MiWJgzJlKU$V!OC=51l
zDV=RYS3i!e)0a3BrN`;YpS;CnWZW^krK3KQCn?=$bKRS%Kvt00xm`-fP6`~_6zfX3
z(C+b-EqkFpZ=2`G5-RY_3!2?{mV7zlP}NvC5BezNr`Sv*3UmI&R)0U3zhlD*yRT9v
zHmdT}z)s41Gi2d1tV-na#YOHtZj_ZA-1@rAhN>`qcaqM1CfvEhhVqs!qA&?j%NdtW
z*?W85XPi+bBkKYu?l2b<?%liwPumn&AAhUYF>X{)`JyaG{{`W)MRj-fnNf{XOCpoB
z(uw?3@S=hAS5z=AK)un&ovQ5G<Zf*AoT>=p)S`{2!2WS7&aLU9ET<lxIJtmu;<cMx
zcECBGY<+OzS}ClL##`YcDP`MITx0JxP{vRgs~h`<aF@RM!=~{7k$=vz@Hi5R{xs~e
z3_yPjc#i7mPpQZke9(`Wa_Mx`SATvV^BVn;r7C^VpX%VT4qyH8vx0oaK`G|_8R*Xr
z->^OC&(MWr9{Qs>OI(8f=-sHfg8oR(1p1*r6&}0Bqd$|)RGrbE<lT<dFz;#gy0%Zz
zpTN)jzi@q=YFA~WKMSN^G|`{7a#<Mq^Ll;BkLXW!W4s#rle#b@4E=dOdBF<wr%+ox
z0sRq$bycB1=ZsVN=+BE=kK)lElfK|+^yjE#{GaGg&8>F>=uh>5GjZt8$TPR;=+7mq
z?0obmwXkC``V$$y*ctr^z2R{R{Sh8&2S0j@bZYlcfZQ=o63J}zrzg3r0sUEhrDGiW
z!?#=Sfc`x9it0drs>k|eLVvcV2Ej0X7JGYnIGpeD{Tk+<tkEBO-%W}Bgp99$h5p=Y
zs}DwhW)$$@p3w09>-~Lw^vAv`R0jUYeUHg2z@NYK2I4)ypF=x@q50@fiJSEg=+6uz
z9U1!L>Rjyo)t{2Mf6$+JK}se1bGY<)5Bl?M-B?fbr+mSu|3N-k)nSJw^oQ?Md=vdy
z*Yp7PcmFb-lJoXo{Sk_PM}K0b7D4Xw??U$dVF~`cJRK^{h5eoQX?Z{NU%1@Bv&<9y
zS+Z{x1O9}zZ76yK<2U;`=-fho!p(1wpg*~RH+kUC^%v4vmFQ2UR^(LlCw1mCxHpDR
zc0VmIMt|HkPd$zPSbV4eAHbirAE0jULtcBj1^Oe2$Sg#E+BfJ1pg%6Ak9R>|Z?=k*
zndr~O$Ak~{a}3_7c7i|6&Sy>-f<L2|KGXI=f9CF7=>-1x)nAlFqCcX$PY;7Xzukyi
zDFc7v)CPW>js7^grn2Zyyu)QN_#-hZjV}a$@?uho6zET}>A-08=TUe@0{9c8rKj>M
z`lD#fz614n8(f+P(I2~`Ne<x8+>S8rA^PJ}aum*&IX|*GFBs<UxZc%CaPP-PdrO`O
z(4VR6bv@CahWVOH!Jp)pKd)~FfBMaCS=fL-nUnG};QX2dU(+hv5B}`6S`^U={;YF+
z86!b|itopQ|Jwd5zVo(4e@g5OqQIZP)o-@sqCX?u6PnSV`I)xH=#Q?Jjt2O%@vZyz
zH1H=jk<q+`{#<wb0={wTQKj2O;E$+tbJP&}vt@>c4*Jtmm&Tw!ZfjjGpg&WR^i0v8
O`T$oc`eX9AukT;HoIIoe

literal 2340
zcmXBVdpuOz9suy|kr0ZbNMc-dJWky{#pEWe)jgv6Q0?(3J-jL=qY|Nza1%}uI+sc{
zCpS{36g@;LkEuMH8Ha>Cn{=AIX1wpkzU$Z7|Lwg$>$iTtwSMcnG={nlQhz=3(AlV7
z=R)1+@2@)1+yDBkz{2#^_)3+4-I$8I*`U;?im8YA_p+)3NVJ%B+sbMsre93_p(Wgb
z=BBjEqI?oCbtKCEaECt<Devx4U^`-tk-b1^haX}-abH;0R+Il)-MZy2NThNmg7e}(
zm>x52Zt&lODU<75pH4qa=_u<YS&;Y&8M}2R?J+$vP|4+-LyYW$(*~{>F--J{BL5{w
zL@i6W=pw@O&_u7-f?JqUGWEZm>4unDS=AEHEF@$y<|{8W#&qhn36D5WOx?1NPwOi|
z%(L(#28L=#wD9@;m=-CfbG54Ph+Pq5#L_xd*@~Di^^M)X?;zuontgdGn0|J8qu`&n
z5c8q=T#{%7nhO+NEmP}2!oKrbmwojx-7ocY6Ivif{E`y$enL!R{Z>o;9weO8C_cVb
z8Pmf|%amO=$@|(@!>v(>ncLrPbmK6I&x_$YhmiV?ekrcj!4%cTw@$Igl%kK}mkakv
zeKtD;!ycGUA9c~v@55A8v3Hy0-!P>+z4POT86;X{_S;vP7N+x+@1<_FKy&Ume9<cj
zrqpgM_gd#p#%sO_IL5;qFQcxdO;d=e%nLE^PsWN=pv<e{G!kjn_!W8AVtQs?yB7Bi
zraT76B>8oia&S$HNR}ay;y#x!-2_a_?`QYeRv@M~k(IBMNAi;_(m=nG_~QPzmuxXT
zXvKC=*2R>{l?LnN4~UV=mkTmSNc~46Y@5|c{u-UfhYc}x|7&USw@k#GKH1MR-$2%X
z0rzHN4yJEj<Gj_VM~sq9+#Vqr-+YKInu;dz2OTst9%GJY3p-WqDPo#Rwxv8!K_lb5
zre9WvB9VEHNktRMZ+u;OLrXAXX4Gu&>_3R+964JWC;lY;70~qK#$1x$<LZDbB);tI
z!Q`Mq#0*W?Y1c*};Y_a6ZCwFb|ExN7ei*51PU6$H98*j7Id4x%C-dh7{>d7{bako3
zudxOzVJ;qeOV(FibBp1YSR`8W#H~!;j_H=Pw7fOmXt1;>viV~OV#YEuXEWC#k-oLT
z*y};;d1H%?=iNLsXI^O>61@%mPcCA6tlNj)MJ7_|b{?4XVl2>~?}JsHJ_jgk)*$9k
zY?pngCRxAYWlKX8F~{91btyL-&0fJKPv*Tw%<%JDQwlqfXk~U(T-q5-^9vi-CNz=r
z8PwyDS&XR#1~KN__aI@TT9%=nJLzv%#|=tJSV3LpSz|9j%w0R1V16JHv7W?c`VL~c
z;vY}KcC*op^_!lb#$yrF))C)tv!CqG{yRLsi<nb+lO+}HL`?GPRSVJ_v3kjo2_N<}
z5@sv=#VX#w91VH4N7gs2@UMsh&Fe3)!l9!(Ynv{T{#k$Aq;n1CP{)IU?<OK9?gz#E
zhmSC2?C|Q+_-7=Vm*D+$bqS{X_>)zpLzr3?)~e^FikRQ1LywG9kZ?+Aab^9F;E!N*
z)Dir7HQ<y8{_vmGr-46tMF#@FpQevB_25r|H)|UF*=1b12mI-F?2>{%<#pB(;7`oV
zf78LA@v6@A;7`ap{dM3^ch8qh@Fy-^djR~=dn%~|fA;z-3BjMIbqOP+e_AiDWjTUB
zp~2rIq(7F$aa1|rk0i+KG5B+6=lD_Zr<N}}5B{i@uxh}cp;7Bo;19p(1Q+}XuYOPR
zU-<mq_~w(~PsN%E64!{OCSI}<{K@PoJqP}XlWwj6f6NU2m<E6Pct3Q5KfNz49)Uj=
zqK9Pv`=vYA`#Xa_^BcAAfj_>l=Z}Iv_l0N9gFiiM93POpuNM_2?gfA3vrIJj6OT*m
zz#j>3WhnUb>-#)?@Mq~;D=$)?&GwOkF!1M?M(-=|M_YZvJ@9AOr4z*GMP`NBq59xY
zF7xai_~RQU6N5k8s`wZ(UbBAJ*lFUA@#q2LG~!Rb)!6Jy@JBnf>lpZxw)W0D@JG;;
zD+Pb@BLw99Ik=|&`}}u*elG6{27fZn#u8t9&&u3FlSqD_<eBnwB)(WOyjlF+pXsd~
z;LoRbI1K#xd-~VU-~B-=+`u2dCk1lik5p%};3V-UGG|lSH29;vcAf$F6J`3zpZK%>
z(dCCG#GgRb?qcwFa9hLrVd777&Mo#n@Mo1}j3M#I=j}(6N#f7dsSB;X;Lq?jP5|*o
z{e*h<IPr&m95Y}9{z%&+2f!aemdhmRpTzeS#$Dh~koRmq__M8Z{&MiA;X*4H{IS*j
z><a$ad^)k2%%5}EBalt}DJwk9YbX93Hqe%C27mlKgOZ6qPx-0a$^F%m=5PFSAn_;9
z?0t?o_;V%u{X*i8Rp&RJDe=eoxfiR7_)}H=(ZZDYBW%sM*b4qAl&e1gf9|xK#e+X?
zq2bSnKWF%Y3_I{grGLZ{{PF2oRZ9HX==4CG4*t9>sdWW^oIAwc;E!aIT|~}jFgI6R
z3I2o?4txWD-q^gvq`zGRSx&a#&zqJ$UE+_uo$lTQ@Ml$*?jG<*R+FGk{1G&^<P*o+
zI<96EoCSYgx0U|_{unG~>w!O)rCdGY&rxdcD+>JaOxd&>{K;KBwI2N0WHP;<_;aMy
z?X@NN6H(;h3H}8427M*|EUwe<oe%z8PHB^aKitUp<KPdI5qSjsITdhh4*Ut1x|921
LN=fI8d-?wX7iL{{

diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin
index 67097e2ca604328ebb7a3bba06f6bfc54a30f668..5b9252ae283bf9437019855d65556c7fe5fa22be 100644
GIT binary patch
literal 7092
zcmXY#c_3C>7sg*hiMUFoIaH!3QwiyugQA4nD?^HMB@~HhFjOi^hKf*1)1`zo7(zG-
zks>k+*VH63X2|q@>#XzL|GMw;JnQ`S-fQi(MSEK8IqZMaNPq7(gQs(S@ShSsChY$f
zI7V@M%{$e9DpMR`XRdVZ9+EdXdM|s1CLKK-@BE#h$a0>Gv)H506!&X&?5Y+sio-YR
zJsEPB<VD1a-rt-^q}V$uBz3lu<x|e}f)^fA+`gA4QL83NUhBRHSNa~s;ivd#Mt&v|
z?^#6aAKOV*aJU({qC=FaP~TdWv~MKuk8#@kv=bCZMtnS9Bbd$&sT(o3=a3bGvlNqF
ztY`C6pTp)C`5)%@%1)Pv9(c$9^8km<@2Vr4-_uoWeor`<-#sIKI_2P#Hs$d9n4ds4
z=s|ur*2QCfN4Jh)exi3a(usGhKh~5yXY*Spi212{_ObaLDaHH>x=s?gSCp4uH<HHu
zo~Y|l+#cSo`ltsaZ_0;n*Mnw=H{9^k`Y8SO@pZe}eukfie4~6C_*z-H9@2t1WBU9`
zQ^-?9%6_ncC@L4?cm3c7zEb6yU*#xn|I2NqA-VATqVai|>nM)EJhRl;TSUQSsXpHq
zfe?3G$u+VLxOhoTypn@BAK&}pPQWFnKP==Uov)@pT}javeB&Keb?>wJ<rZUpX7aBv
zzb@h~@%C}%igNQ&Ha~|N%<o7FjrpBR6J+!Ie4H-&nDe)#AG2<dUy=%&-<kt#e*23t
zzm(8gy5O?A#=xs>Y<`Z`nBSyfKAWF?4(4|)UXjRGtu~GO@|w*rmY*`+)LJ;~ZbFX7
zHZSyYo*;Q$Kf{ynIMVGV?axj3181p&q08K+s9n+l?w!`lN#0+*>8ZX6<e16>&xQ9C
zhz@hL{sAlC>~quDDRzP~-LY}yjEyhJv4I@Np2kZQN9eHS%R|O=b64#8XA*v7g)Fz#
zN1=q;wazZF@Y5f1>ZkW|-}eR-zea|B(l2MCJ^f#`C{@VMP)4oD|2pP360C>$xp%C_
z{2r@PM7#0ku);cu&2LjB=GU-Q0P}nOD-iP&t>~jWEONue9Q-gp+h3ZPpGd%d%<rqy
zcg!#IrvuUaWA6GuQx4|mtFsRC%euY=^9!;c!2F)O#n9~;QL`!|+5B>D#!#jkEK@8s
zmr@+U>qWg+1j$P?xLctZO*Gto&`Jft_s%*0xg^Y=;?`()c?QBcQ`mp5*20hC%t$NX
z4NTHaYf=KtEg`>O#*1qSN6J*^<n-btu>MAR$3~W!P@E;gJCkcvh|b2*YllQ%lNIv?
zpAPq(rMMMlZO#)bDUOQ5J5HrE$s0VNAoH!5?uwG{ign?T<(=0fX}>be?{ME=m|v6D
zI?S(S_C>lOLj6i`B%5Cu&jj=H`g;ZDS8jcU$*)jI`xeopJh)}W2hfB38iM&Szq*7W
z%x}D?AM<;>;vU^uS9h{34fG(tv360+?+E7|lV9<u-WtqL(snJ;75$vYF9qKV`Q^p#
zrMLr+FMi!%L2>4FeM_}9qBtU!7sr2|fPH*mYsCpmiaQdQ+@s$HIxC8=pC6<+i=3{m
zn)D(o_yj_}@n47YQ2Jn3zA?!gQRs9}GoU!r{4$z`E#dEPOl*nc!QZF%eGXTpIMSkD
ztnM!)c_$X+_r6mkEBHU9bosYarqe%MCsXVw&O*y}{c$;R>LZ_$fAT0<Axw6(-@XLr
zDCM@$H;Uwq@@0_@qa?3!t>N<bJE5MHB`?xZqqq|S6Nh*u6emLG&mB3?+40tWt5_&m
z{>LpmFHw~;T|0ZcB=<DMcQo12bMXYlnG(8q-)s+AA@Vcgl;kstJDGB~o=hcq^rhil
zhq5Wo>I;|eZ7G8KW-T{clSDo(A8e3MO4~5<8GYh}d_G2tAfNu@LdYk4%9G_2I-BKl
zIR*J}g<*dHpHum89)VADOd;~wx=kASH2nUAd~Bplk<a&>ROHk1pceTMvu7h8@>U-5
z3BQ$#eA>dhk<ZNc$C1zG)0>gcJez-z&%&jH$j92q3;EpMxD5GhZ{s4L=h+6xr?}x0
z@;Tm6hkPQYjkA25yO2*!oFDSxUN%NPNlR`b9|>hCsISAy34XBtz3Rp0uNJ{Ro30dV
zag(4sr+psrkcK)}Tq(M=9yscE3uphPIO1<&vWq(@j-bKfCxXFrQD&hpM-t8z$4*U6
zN)q*wto>QN)x)UwQ2jkke{Xi_?rvrELY|_Y(3J?(%bBwc^%57~B#JUs0xkF`)KhKP
zMRB`_s$yiXle`9(>sA*YlDv)#xr96oBEKoYP_Xkp=-EwkD1Sk5hYKSUN~gg(<epM5
zw}yUeF!Ib;m@YW6!#JI8h4ob5HZBeK0MNT_XvykzMX`G7zv%p?6ql0q%=dzx!{4Zv
zSbdAtyQIzPJ+mPSPMGg*ih(}M=-I+|^lA+VO^5x(mtU=K|C{8ETlN;uX{0|N+nM3E
zKm+u=s^%3yeds3)M_jz2e-*Ef74L$+ZFWN?C5x!bdHFRh$(GrVVOt*DVD+BSsJHbG
z!Ri_OAwC~lC4BO71?ow9!MOsx;VI~wpcg0GjC!j4J#<~p*}Gz@Og{#_U0JaH`3bd_
z>nOg^$s!lG43Zbvb$aPYIx#qsZL7Ql)}wV{ZqScOi1YZfPiO|kmuF|Mcj7hFvCT76
zzU?FiH0R#;TL7FJC5<n=Ze;Wlg@lGt&tA*`^`ur<(}N@TJe71=y@y*2Q7=^XB<gAN
zg`i$)tS&vE@v=KTghIWTmrE$q%Oi?)Lv56x*M_-gT|3Aro&My?Q#o|M2K_Z5t{%>b
zR-we)Ih5(G=Ddl<8Pqg+aZ_1kaf*}pI=8R%13kRhCh+}w4p}k%fKN`%bkyrmYev1x
zQEsSLzbS?2*W7qa$Q#xT^yV#xzIdzo^|VTV)Kf8jfqFgKdx+u1{gT#;uc6-Sqg9lt
zoYtkd-*B$C)J>eXFCux>Y7yoSxI}NLsZPaZ@Rf*?-Rtj4nJUMR8)R)IdGZtCeil56
zGi$qI<;K4W9-rXVONkU&Au02FFExpJVROHuo?9o}Ye3KB>PNaabXfWr$?BO`g`%FC
zcM$5ij9o*$>Uvk2$9H*|q!g=XKAZ^qz#-sh3!LwYpz5yFyKs)?FZ&v{l%T!tj6R<P
zJx;h!;i_cHRAi=L<^i~$486PExnLK?*(<rGeaxIL3tISN<QBy7-#uH`c>?v^kHUFl
z^kSM=y)$JreJZ%_UJdv%dM5B4j9xO-4b+Qe^<or=vY^oP!6a6%z95g{{`~CRnJ7u}
zetmbU+*kqo)X3;r-Z1fu7hKjN$RW$eGj`qh3)X|XdhvhCwy>X51oev5$#JvD+%rWn
zbi(6b2^SxOo-8%%&(1Tf-X&GktB;(CdP!To=w~BJ3wNw#^=?Myqh4$3Q`94s&!e7?
zRWXr}S@y$4lGPinRfc`Ka=%^uJ8~-Vj<H=da9OsaBd;-^_ONa97h`;d<b#T4J5i<&
zKB=s{aE9bP=~J-J%c2DITJtoeCFm00rNf~C5GR~nHbZv->N#=tp`Oh>*ax8Zw#Ak3
zI3$tXT>5`K;aRA+HhUWC9dy`%dfHXZM2WA;DqWr*>WRvEK%KNQbkt(%OZKiWe-^`i
z%UN<TREekwkR>K{B49n@V*Z-B1L}rKscJ9WYx$LLWxEKIys`073lT-S_VLUd9$^c0
z!<}Av`w!|lP={E(Kr7T6I4wli1gulrYXyBB^u%r~LOr!Fa;#py1L|33-X?0}Yx<7&
zuzIq3%v|u$?9pMU1LIqBLY|kAyrRz~r2-UPlB#MbRzZR97&o9t8{!ho-9(hYw_7@U
z_&E4F*?#005+6g(&)Tci3cl_Q@-8K)msBww^@9DMpk8C?3ZmqB?ULfV(x?~s>k8_5
zRBc4P+*zz%)!sz<;}sFR)U~YMYEduBl<#}sOFp>2h=s?lAYM|OO`R#y4@2q7%DfUE
zLpbMxJq@>&KES@>i%z&!2>qy5v*@QM^x+G|3uPV=)!z0OGJ3a><rC6v3;QHdFYW6V
zR_}Th>dpH2fT*mN61ACIfqJ*fp^k%Ie0?73y&eCFdVEF~=xQ&&PfEs~tX}I7aJH}&
z*Z}u7VdsCcyuUE>;f>MsrSP3D30b?unYnONYDhbL=ctQ$0GzjoSd#?{RhW4&N!e#1
za7eE^Wm3b;gU4lle1r9(1gI;%z>^y-1NTgrQ(xpKMZ!5SZm<!7Iax;hOY%9mp9#*=
z&wLY%{`~u7(LZyA2kUS12>mlt$65dLzy<s*65P;Vl88os)oE|h-|LY)`mekTeHZ+7
z7jHoS<x9-bzaR|i3Ha|xgFX-b&tJ5${_C2-pR>oH&K2f1?;&!p3d9-Oxc23Mzeljn
z?_AK|?fu;MD>K)9FpD}3d5BE!cTfj?sT-YphoN4jduTj@`z_z1Nza3B%sfY~FM9;P
zdybayfKRVit4>E2{O;p1nFsr7GIyD1_$Sbhf9oE~+)u%O_yy}<lZXCC=Z&-eflBEA
zY|91KUoMLE{|M)p@z;ZW&G_p&vi?Gv=&yDr5&hq<K7{_d{cF*GYGnud3x~~O{U0{L
zeD-C1*uN#rd`30A`wQl?ZvV5Bi(oEuAVs4qU@kjn(b3Tgb6Jm%_?DwEm)+5>stkg8
zEMjbMIWsS6rFy9uG4t5b6PZkXI21RQTm$veG;%}yDwxCeX@v`}f;sF|*-kkNn8Pwd
zCVo4@e7NM#(bRkBAC;4V{+0ec=<oL73;K(T8l!*NB_;G<6{Uv$dCD2+-<H;l{(b=~
z(BDy3it&HQKVyvb|6+*#7n;7Hzq9EC;~y$$#)tk6Q{P~|GE26}y94vpsVv3nAIyAp
z<ZK%7opY|+<^5HdtEibPZm2MGRkKrEJ<L_<`fnwbVV*L%roP$-<|+557rWrzXQXSm
zgAeZSON0+yJqi5h3kXZJ=D-}aXa4%Ho8f-1sA!{c3GSnI``0~&`*UZMl+jO9^gr$g
z^#S}h>~%-~f9tK$-{T1E5Ac^&g8B>odjep8gMY=^DD>apbRPY0GW7ubP2;zs|GAL2
z=s!Q`6Z$I)kE6dyFzdf8YBu`suyBL@b1<YroMQTIOmqG*rr#1Gu0F7T!Zl+(Uom}l
zvUCO1qoH?gQ6${|_s)Bi?|P8w*XfDmH>l^93NtlezTnH=<8ch;Pa)^|W}TYw_i^H4
zDlq@_s#PaAdoXqU()IECOdTKkv2iu*zhi~YBQa2?8s3={=`;QGg8ypx4t^sUIlY%m
zUls|F0Y2O{iYk`OyHLvV=0($?kKSl=Ue(F;<AYI6nQ%Vh{j#jep<e568p^L^>h<9E
zOa;&%_YA7ZW_}Mj{@We$>*m~i8N<|lg@XT>zU6*+cl0`@U)EgBq?vx%x!E52(y|>>
z$1Ad#zWb#8pghwj6?C-rL7(iOr9&QsKG_nuI87G%V~(BhrxvKkKR&&;g7+=4@Kyt{
z^-Mh;xVuRn>PxSl;SxFcKF-D_8CZYu+U>2@JD57G_;1?;Q-5`JeI6kHN@J+cz`t#G
z2l5v_t%m&LOk`R9fhUlExHi;b;QuCR6Y_uj<TvscZh(28;co!%aSZ>g2A03;cjTWW
z0Q&;?M=atZ|4vnye;NLb{m9=qN(cD|t$}_4{AE77A^&qbl#sug9;_SiuOHJ#{<d<p
z$p04Qg#7b6W+MN-BcKEP_Zw#+|Il;b5Bz=KZbJSsYS<?gDmMfm|2;$So(TMJv<@ME
z%_ru_|8(Rb<o_)U?>!1Tbw4BjO|MOnzw9m<mVb97@-Kesg#5Qh!hI3=`;);e|BMXe
zU;kVL`8Pc*LH>u64<LV1fSDJEl<b<}-1n-N@CC&(^P+<M^4*Ysv5|%*{QbZ-m6LO?
z!{5KTs<F-h*2iFz_D*TY-!9<3?>@*sd-`16I+#bMMCQnShk4|C<NECtFxSjWyQ?w>
zes><;S5ORd$UlA`)&;^jCJfrHyn*>cLU6Z4Co>N&zbb!`nFnj~atfGvP(Y{=`Wx?`
z$7YGsm^rX^?t@%r4%`{N=OXmUZ6Yt5yqNiK&ucTd4_uyl{VPY0nK!aG5m0A!26G*g
z!hnx<_J_apV9xll_U5?&X3l6DI1A@1@%3WAs0%P(I2gn&O^5xVofS5NIluXZ9<%sh
z9e1103pN6O@09ZzUtoV~CncYqg!M0WST#5Y_J`TQ@B3!J_a9JPcl<ca6KYcu^6-AJ
zrM@7bcP+`QR%s9VO~YIgaWx@Z7v_kV``;7vV2+sdXgM3k%n{?>y>QQylv-dl^b`3X
zjMQNHyDKAq#Z#A&zt$=X<nPgH%ktN`jr=v%n;`#tZgB4e{s&g;A%A|aF62*{L;k@3
zO_M3|Pt=@3{(0gJ$p62*`N%(@MHl&pb|oSIHzr}oUwDl-%YU;K@|Rrx5c%KS@6Gae
zh5p3wzjGV;PkMVGf1`MH<nJW50{NFuRUrSkv@(W&vj}|_`JZzZM*i{9%>6?<D|`WP
z1^(_^ijluWbr|wLcYOx(e?r?L|1|~B-++JTjlWs`RW}&^lY?rrk-y_+=#vcpUrUjH
z9JK-Yzk4T*{4@4KodW(|V`9jEm!KH%eR%f20(keI*DYRikYL`Y+V*`Np{wtoSab8W
zAH&(#)p#|=HEP0p2@{tLbq3;W^3sUv`%4@S+2DEbJ@N<cD<iRMzL^7Om6g*!{Mo>~
zXZbpJ?x3q2r3*jagZY<FV5M{x>??@-;rSKgN_Bo=oJ(gvQROJ{Z&DTbLYxfDYux@%
zTP1gC!u!{n@mDWSFz;W<Qoh4PxA_)Vdk<yEFIrZsGzjD7M$W;w9G6oVw{(I-cbhM5
zTino!ahkn%z&CZz`zG*}7X9{51@xU03wG-tn@_iB8SLi2!(rad1ozZdV_ZqzJB+it
z1^W`>wnc6xS~Sk=N&K6Q+ui98?`t39SNlTT!rj)VwF2OMjnnibXM|`<*)eihT^jZQ
zmAOPL2;<}(U_OSpQ<b4?+zc+=l=7F27rb*}oTwt~hvc!Y3yJU^$6qaTs|xCsVz^IK
z1l@jqTa`zHAKZICI5Zj_z_=bRymLX^ii4Li?zHk|qW%2kvHkh4G44NuE%1If?dd;3
zDe!)$yL9Wcz3_fl*t2`O6G7WnR+atYz&oD%nkDJ?FwW}HevH%K`~~AIRKsZ6+Edy8
z8XLE)5%&9936oHtV0iC)8r`x0?mJU6X1^#qKzw?TyDB;Z)<fiH!ApJ`<3i%!VO;Cs
zJdAt3d<p&OflBeyO%%qZO%=lVe5Ubx<9X(NF#XJSm=9K85Y@^lrF)lk?<~352X%a}
k#fL;&j2pOVhH>8e;k-bcww)=_yR1}#wDQBa_eVSa2f0ZOT>t<8

literal 7092
zcmXY#c_39?7sjuU43RXDL?{_c1C^X}%#!G;49S#`sVFJxrJ_PJDWQnw3=s|FCPgYC
z^N=~E$W$TXe(S9B^<Ul9^E~%=_g;IiEz;k%o5lR$C&Pm~Ofu#i!N1NvV!`|~?|zCs
zpr>JYsh?sA_KIFI&?dQ)+p|0RGdPLdc7x|t$H)q<<Mfb(Qxv;9B_O9amtsxRD3dM|
zC%IP)ggX<<i0r-;)s*E>H|5Liy1I~J57tB)<#9-E>z>#zxfh^5%T`F@29d5WJwABR
zkE~?b-<u*&QdToQFSyh@i{y?)J?>_wQ7oCdk7g<_IR%3+WOJL2k(GQpg>gzFn9m@(
zkjZD%j``Z#ZHdIm6xF!O|L4<jWAaT{Gx=`q!+c%y4sf!6o-3YtqL9faRfhRY7UVPe
zURYv2VLoS0I`zXQD7T%-7a4&0qEn@qd};DbzG6$FV5mt(-(wb&?@A2C?(g2F=V(B3
zr!*1+x1~dUO6ZHuFi!D}mC>2;e!$P|$g@~=@ZHrTZ{!Sh`Z*n!?ouosm2~sqHlpOp
ziBCl#mEbG??9=Rb6np5K*@$fld~UN<o?nt;@$OJe>b^?6R#i4rd?^Wa$D8NB3IHy9
zekcemhPuer<!g=rmtbx3wiM^px82p^%eR2<4c=ERPnmo_`I&rqzz^~jq}?ZqZ|u~#
zHqp-HoA`<O7CWUe`HX2@$mdeSDY>$}KUyOK^Ho;VV7@=sD=^>U>>(x}?`F<xRewWg
zwR=py6Azhu`Dd7XfuWd>_+mo5`X(M~WWnSs+89e&QSaZ_sg{!yaTRh~e!%zms^|Cl
z-6~F(>DkFkk`!4f_st>p&Q)sb?9kQ<;t)ACHBV)mY6CewFTl=gZX59>G$v&7r!!e`
zaQxy~^FNdod#COhW0D;2h(5p2#17WYsRbG%Eu7ZIWnU!rz`9bRf>*geq_)Z&E3@Pa
zC8z3E$xqhSQ_~1#yAP5)M3;ntVUZE67d>~8(T}SzpG(bQ%*VE|$9%2Zo)KN98@_IJ
zOvZdy$#l$D`PUZn?fi8H^SS5E<$MX{?KLh=#(d}Wr7_>7{;y0v>)lK~{urXQ;Z)p;
zcqZQ(aSHQo@#LlRDe(UbVe(a$aJnRBw*L@k^8E=GqO8{SB*eVmNwG|=v&Bn2NbbY@
zOBqF~L}P)~j|E#nZ_eYx{jCiYyZW5#r-Ee^OF1TQU+pJ~B_#jLNm`!sdG&sen?WqH
zqHlAe=L{3d%4AOc;ee$S%XVF+;fe1QOIGy9zJ%ArkMY5ktQgRf3<>fpi=^1)x{Ehw
zL{O~d+WCH0*OS~KbKW!d_c(p;A{=X%LcShzH-n&MnD2JxYs@EcaRT$@?b^<1d@9}Z
zS%ro9hQAqMzS{gT%;#kwhWXr-Gl<XYRBs2H9m9Oy?Y@{VBH#t)Gj2)8eBPqxIX_0v
zCteK%J;)budlK_m7|LNj&wpz$-|>D2qOVLZ*y1je&(Po;#U6fXcwYeY=KW`9mRkhA
z+u#4#p9t&NRWi?U9mO7-sVNtcL$Owk<axclK(VBCeLLIVkd-`Q?hTw!*bl$rPG4#u
zxg#a%lEcyzi~rBv%TMWjkb2?Hd3X5#CKu0cRH0akZwwy|<dIzer|~08Uy_x)d`bUW
zD<~^LH@B`2+VDN*%lOV(Pfiu+>^<whi>wrqG`agd1NKqN?2d#8k~_+K$|$~&<kmN;
zQ%d2`&l+CLPWw)=|83dUZr4Gv;-apZe%ed1%(-sDV@71f*g(+qRi>1crXsb+Z4bp0
z?qglc^O@oiI&=D+tvOjK9JwaeyOLs0z5baU4t+oW`{`_tc#5TNUVGzA67;uh^IX?z
z<dZVT0r_m?Q)2jdoIpO$#s!hjlJQ3jp9LDor$f*H`7~ZUihRQKG?9<&Ze8RPyHcLv
zb6o)WMESz`1AHD=RU@B<;<LzS@}DsB`C=1;eB6VlBcHn}k;vzs+F9h&bLcDbF;Evp
zKC7-ZBOi(7rpV{>R#@-A$7pFZ@>$%pl;(4o+hmS>T$gV`KGp3jk<a5C7332$CW(A9
z=ISG#M=|2a=iz)ehEKz6=&!w6&XV~gx4*>iLi=`-JKX-}ew7zdIGiD)wyYic>)VNg
zE98NrTLVjICdHcN?`{)nL$UZIUOF$|$|?0vd8yNu3_RQe*SPwj-mV?Mf!1qdqh4Cn
zGfv^Kzwp;wIn)c3PoVYcC*6Y?y>NZhbJWWvO8wn?zpjUU0D7zIdnk5a>Ban3b>Onh
z(mv=r$?Yu8G!=6siUe+%mus^?&(63*ypv*&8gE-Rl1Z`Tf7Le~`3ZeVq<S)HJ*Ox|
z|K!&d?eKeA&RbK^je2>O=TL8_X*lW`XZvu91ggy97SBSxcFlCuQ|0HNUgP9j)SHS9
zC5lomWF0hve4wXgUIM?LivF3lXK+q5+AbTuOLE7Dd{!3ba+<uS`c?9rL2qBVQR*nH
zqn$}d?!fu<*T-M2*Z|J$u&{4_Z;ASQo7QN5gnTR?%?8WOsCRBY>|<Kb+XwYl304wK
z-oq<iy`j&YedWgIJW(&W_7>`C@ULd{QiD15_g30|s5pjt^}iOu`pKJT9;8L_ggi)k
za}R#!FrAxk)ij9TZNzT5d4ps{>#Mm3Pxe6_QGZ-Ak>W|O6Pq)zi(;vbaeAusiDCDy
zfjqk3G}_%{vv;GOyP*~86`gZMy_aUKoZoFZ@v;e!5A?zn{-WM>kB_J)F$(tu(EEAw
zE@#+7|B=XT-~xJ%Hx5!(XZQPcx4)+NBHq4^t6f7*DaW{GRHbo-UOZlOvkvYlGrCu|
z#LH4v*Q4!Srj#gtLWp{rxshUJKdP!*@sBeyxTrIFEBqe(4^~;FE2G{(_<ht%5{N;)
z=R-S*p@NIKoe_hm*Y0=^_1qf9Q19g#KGdtc`kfdV5|}(zOQGJjv<S*d*=cv&en*mP
zsMmJrVlK(8*u+&zzC;YP%$Xi12fh;8_s%-hQ&y{7?DtHAKBRPNYj1fE#hNYb$e$rh
zvK$r^+`3vwR?Zv!VmJ60^`dyPP|wRb67}TFK5_<HBDM61W2hG)auoG;xS69~lUF$E
z%{ZL*lI38zs&^Fffu6M^^sk|-@%Ni6NN%~bKxJ7K?4xE&1A`qz;l$R7+OzGj4pJWM
z8&{#Mgmyj&G=e_=`>{x|p(DkzoBK##%#Ks8*u=l06!J~G92OwA67?Q(OHr?)uNL(T
zm)dg*Cq$kdC}Q;d_I^XXgB|lxuf&8xJ!)4LQNBdOyDat?qn9*?V*kjKn#sSG<n~mR
zpASo=Sgh1_<@}$C+`o~N8-0(F6%*%ueMF@wE8Ce8=C*J@84I#hI!%xhMzfbbb{6F1
zG<S#h8nEErv)RBTc?F~QHWT$OTiK%C`vfj0cT`ApmktZ{e671t&uV5j>L~`UK)vTL
zEs30t83jCW@22%k)yttj=3m*n_y;)^=dC+449*XQ-YtKg+HkUbbl3azQDmjS8`k^Z
z7bvSc(hqqW=8@c6wQEP;Z=!gOk6m4|cQ>c7tY!V57|191aL%ha(Tv{bPSg|DeSmsn
zgT_RbPlEER&;QqRTFB^yz`Yjq{%*HJJq6jlMB&Go2Y4qLy&@@L=##Pk;wKDYf6rGi
z@%NxuGvssCKWh@T>Un&@`vc(j%{fuaqY8aP`k<u1NvP9(-KzoT^tflNzx*9e9s9%@
zvGy;}H+GvZZa9p3*YtQ9y(Lkor}n9hQ>$Jz+bduO>eYCfp`KI_tYcblmlNvkk7*_9
z*gmS$T})9=J}VXO-9I;9<*$U#xnZ1hdr58~_p<gp9ZspzrmEGBa-b(F{5wtu>XJ*u
zt4@HP_Fbv4<>2eo5)ty7`1s$l?V^*(P<KG2x_ck$B?fw<-isq}?*Tm{o)MySN%T&=
z=0emPkuX6$i3VrXTkuo@_0G;q;(QEA;!0g-^lBF7QC7T<GP>8FhjYenjiP!2#nNA*
zE*}`psabesb;(R;*yo<lyQSd1_1%Mii!%%Aq`ggSZ<E}*znc}4J`uGwYo~kIUxsz{
z>JBG+BkFa%S7r2WE2G}r$RVOe=I(pfU%(Iaz9hr%M(eTGqMq?l=;NR_+clk2TOGbP
zPoe?!Uap7zJD``J@xTermtNTg%FXnAn0ZOK4fr~4F?tT?cXtXa+ieDXPmKb;W!Cgu
zs4sUqot_8F4v1#~hw2I5CR2JI4E)+f^Ubo+(s~D+?8EbZ+0b)fru4{M*auO~r2Zdz
z4$Siyh40O$)0!F%^B?%9TeP9S)XGKZA6-z-_^Ze={z=duY5y)bzrbIn;y?5cSPgR$
z`18LrLI0j^cg9~q3;pw5t<isBAM97~*KI$A{$5RRf1~|%XTW{otHO0DD|)^=ryBAG
z>Qd4+e}4-8ZeR7p$3Wj@g1r&;-IRW@$Q-zb@~A{71e^qY`Tx4zhM-@4^-%sf3IA?m
zV>^7Ap65h9DD}YS$CE!b!dy7u`9w@&JA59g;^hT8lffP*92`LZhR5&JK*qlo`V;u8
zEsjNh3$as-{|q=6!QZkG`abvzCpR(v)p?BnTDb3k|FWrA#^1XX{ogC~G5&Rr(SMd<
z5#wJ`kN%-Ka*Y3HUYO6yH(yS&rsp$>CF>GkJ~OcKdX^4zS&uZ&cqGhaN20w(romh`
zcTQPI9n58Z;k`Z6VICV@e#l4=<}s&{kSk8~JVxl6HpB0Da^L)D*e6PQ<Po?(Z<RT!
zQIb&)bJ$AW9MOF+hp8MTM@nEmbjgqU7>E8TE$Zm6)(H0^@OP22Mt}bJ<LK{G8jb#q
zO6$;nD7y;%54dQff8@(%^e4L1(SM;<Ao?G5%|`#y(>u}s^^?QsuM-IO4%&aiJeaS#
zACzik!F;v-g}C1|dcJaqnVkl6RrbY4cNAf+x_EFyRt?Nmm&Fh4x&`-lg&=<c9hj#q
zh4Mdsgn3F#mH#W8XSQq4A9)4)QdW$2>n^xoOCIGsIQSaoDE+5qif%ARt(7R2eg$*X
zrZtUWaR2DBT)aHM75!VC4x<0ulMB#4Yr$spfAUWm{fU4|^xqRjpnnkWa>ifS8~r7Z
z!}$#Up_|6h-$cX}{WEvM{T=*`=fgV!_&;y^gZ||;1L&`@I3Cu|m3^7s<8a<Ccps5#
zPoKAYw)6gk^^;;>8wK;bkVD0<MQ~63{n&P&;1GSzu9i`VqR-b?@9w{Vejby5r;8u<
z-F%JIW7%{+&sOa5g#Z8er?wUS&LVR?q6l(L(@hK452E|{wOZ{f^!nb_*t81H)8|2Y
z?}2+y)$h59AL#RRe%$Rv^f{`K_9q0+QBkt4>=NvYAa|Dt12{+L2o61XL7$(A(QFym
z9|PVe|6V4!6D<D0><4teexpj-fWPQ96B)Q44<-pNv6KdX?UR+VGIXE5z5IF`e7^st
z#wlAkFJG+W)e4}`%i(ICd-Qqfs%kDkpS#NQZTjeQ^5DuGm>X}^u@>eO!Z{gw)I$d5
zE5V0JX5RauA7|VD{h>qmV~_EZ3UohqdlQ)eeb{S5fnq!KVK;JR{0_PgpOUv}gFZY~
z<Yik7-><OdfZ#9WpMAa<`JdVCh5W64wIcsI3ht-Ce>i0g^7nCt`I+XQ)`9#>4<ASV
zw>;z-{>oYm{{xR0{www%|EMW=hoJfI=Ry8;v-*&K6a6{x*N-wp{!+h+k$-|*A@YBC
zG7$N<?igYCTX-V>-kb1_2K-|uW0C&_Z9nASYu17MH|UKZf76g=<ge_Kg8ZxJoJIaR
z3eCuWg?bC}=W}dC{<V$okpI#<`N&^#)D!t@4(~?(l3^{#->0n@`I|g~`4jk$jCdpe
ze_G#=|D$eyhX2WY<ge*gg8XIkt|5P&cxmMCmrnQJAN;&UF#q<K>{)U>jGh-C3!bw8
zK5rd@&#wo*9u|RXf$uDThftrx@cT$~u}LM!?_QHD5)5-jwuMKs3d|!97YyFlg?VJH
z%Epp!Fo(Pt6zqcc+<#ivMH^uLNw2l5JP7&J{tk%BM#22CeK{p62J_(CflK9l^gMX=
z!YyZd9z0`7R)W9nP{6Kx^c)yv9SZjc`Cn=4mrp@|6Zv)}1=cr@<r07MDVR5cdEMTf
zp!gz69kfmPVBUyzxxBRq=8SKuStI>0XIyu9kaCNjGxF7b&Z6gx4c43DfKR$Y()<Eg
zA8Jazci|m~r_pdI5BmT3&}VDURPgtE;iU`vb7;pGUQu6q&buFcrWbz4FbgmB2k`y9
zi{I+B!8~EdqnreN%}_tDa~XX93NxMloACZ3p(WGWu>j_X?aR)(2*4b1`qCQ<6PP1J
zG{qNp!u;^Nx@w>c`3uM$MgB|6Uo-q=1CjsHnFo;ne&cz_KLp+zX#QJV>Hg$ilcV_#
z`Nyn~K>oeT_Q>Db75Xdim%Uzy{0UJ%hQF^F@)r_&g8UuRpnm}W3-Qo@fd3`F6y(p&
z+K&7W6x1VsJI@K^zs|yf;V-Pp@c)>B{QZ6ABYy+Ub;v(<{RQNoyJ`*cPv7s0{B>>o
z8UC89k$>{@T;#tm0M-TY9~6ao0r+oIc+BvZkwX4F^-;+GKQ7MM7CtkL82+9skiUe<
zX5{~_1Mc&{f3W%w@^3!xg!~V+!MzIj->iW53E(fb=p6DdzYOQgaDL5)%4Yg~>WJD}
zxQ`22FI03d;nZ_G!|KB6KC)}|!xSg1d#jzu)J=zXAgEjH^^~aRet3T68{Frh?m><a
zyhjPY={#r;?^&a~Yo8m^?^#Z%U(GmGQW5S(c3Xg(7_WnfFV=lkk6`L-WwGw|HCduc
zD!I#X(=4oW6fcGEKNcltTm|o6#rGO#MAGkHx1L%!5`9KzGEcr^f$y!dEz0nY1a&ey
zk6~T2(ig0I!V}8rGm1<LZSuo9%Xy$TX85Ne>jC{<W`A4y4gFqbw?}ddr#)@%f4bi(
zcsJu6SQroQ3{Y3Edyc6)0sNrOr)HFBe;D{7Y=Ehg-lhicYkRyb4t3J+YuE4l-h=nG
z^;2of_=pxO4YBq5$Kah!O6r$s3R73Fh;?^1G-2I?Hg8S~`(}}PTryL~hWqwFL(EXA
zEWF3@rEDIC^P;}NKg~ds(~&!@#k-#c_uje)zAd#_H*yj74P7?~=O5H%K6D^Do)AU?
z!|hl%LqQ+bzm;*~z+d|PE^kRcyqlR%RXW=#5apuYSGJMI;2m#J^;2~w*2VWf#k!>~
z8d&FKlE^6+DOsPpkg2Qjf_|*2*lL+34DWruo~cJu;l1zd4AzTmqU<N%+O<-J^t<0d
zQ}!3EGmn3Ubt(Br={kX_%m_|dU$n;2ns%%+S?3M?|K+Rax5Vi8!J858u)fsI-5S;=
va0U-KJ6wJWdVE=%wk<Qjx+k5Vu<q1p7p!YE%q0d7+1&jYS&4O4S+@TJlU?Bk


From 53af47012c6242bc636d97c86e7a3799801a47f2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 4 Oct 2024 14:06:25 -0600
Subject: [PATCH 185/571] ci - allow early failure on GitLab CI

---
 .gitlab-ci.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7a9e8e44cc..44cfd6afbf 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,6 +5,10 @@ stages:
   - test:cpu-and-tidy
   - test:gpu-and-float
 
+workflow:
+  auto_cancel:
+    on_job_failure: all
+
 
 # ----------------------------------------------------------------------------------------
 # Memcheck backends + ASAN

From 8535f0aae359c08d55230a1e48a051b83ac1b33c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 4 Oct 2024 16:58:20 -0600
Subject: [PATCH 186/571] fluids - more info in error

---
 examples/fluids/src/misc.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 4510be1fa2..3765ee649d 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -140,32 +140,34 @@ PetscErrorCode LoadFluidsBinaryVec(MPI_Comm comm, PetscViewer viewer, Vec Q, Pet
 
 // Compare reference solution values with current test run for CI
 PetscErrorCode RegressionTest(AppCtx app_ctx, Vec Q) {
-  Vec         Qref;
+  Vec         Q_ref;
   PetscViewer viewer;
-  PetscReal   error, Qrefnorm;
+  PetscReal   error, norm_Q, norm_Q_ref;
   MPI_Comm    comm = PetscObjectComm((PetscObject)Q);
 
   PetscFunctionBeginUser;
   // Read reference file
-  PetscCall(VecDuplicate(Q, &Qref));
+  PetscCall(VecDuplicate(Q, &Q_ref));
   PetscCheck(strcmp(app_ctx->test_file_path, "") != 0, comm, PETSC_ERR_FILE_READ, "File for regression test not given");
   PetscCall(PetscViewerBinaryOpen(comm, app_ctx->test_file_path, FILE_MODE_READ, &viewer));
-  PetscCall(LoadFluidsBinaryVec(comm, viewer, Qref, NULL, NULL));
+  PetscCall(LoadFluidsBinaryVec(comm, viewer, Q_ref, NULL, NULL));
 
   // Compute error with respect to reference solution
-  PetscCall(VecAXPY(Q, -1.0, Qref));
-  PetscCall(VecNorm(Qref, NORM_MAX, &Qrefnorm));
-  PetscCall(VecScale(Q, 1. / Qrefnorm));
+  PetscCall(VecNorm(Q_ref, NORM_MAX, &norm_Q));
+  PetscCall(VecNorm(Q_ref, NORM_MAX, &norm_Q_ref));
+  PetscCall(VecAXPY(Q, -1.0, Q_ref));
+  PetscCall(VecScale(Q, 1. / norm_Q_ref));
   PetscCall(VecNorm(Q, NORM_MAX, &error));
 
   // Check error
   if (error > app_ctx->test_tol) {
-    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test failed with error norm %g\n", (double)error));
+    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test failed with error norm %g\nReference solution max norm: %g Computed solution max norm %g\n",
+                          (double)error, (double)norm_Q_ref, (double)norm_Q));
   }
 
   // Cleanup
   PetscCall(PetscViewerDestroy(&viewer));
-  PetscCall(VecDestroy(&Qref));
+  PetscCall(VecDestroy(&Q_ref));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 

From d474dafee245095c41305838857d34842980bf14 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Mon, 7 Oct 2024 17:04:48 -0600
Subject: [PATCH 187/571] Register ApplyAddAtPoints for /cuda/shared

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index c947846f3a..fcd09b10f3 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -454,6 +454,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared));
   return CEED_ERROR_SUCCESS;
 }

From ecceccc884c9cd225f5362004c4c9d42c6560de6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 4 Oct 2024 08:32:19 -0600
Subject: [PATCH 188/571] junit - error if output file not found

---
 tests/junit_common.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/junit_common.py b/tests/junit_common.py
index 607d21e9ee..255f8218e2 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -481,6 +481,8 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
                 ref_csv = (ref_csv.parent / ref_csv.name.rsplit('_', 1)[0]).with_suffix('.csv')
             if not ref_csv.is_file():
                 test_case.add_failure_info('csv', output=f'{ref_csv} not found')
+            elif not (Path.cwd() / csv_name).is_file():
+                test_case.add_failure_info('csv', output=f'{csv_name} not found')
             else:
                 diff: str = diff_csv(Path.cwd() / csv_name, ref_csv)
                 if diff:
@@ -495,6 +497,8 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
                 ref_cgn = (ref_cgn.parent / ref_cgn.name.rsplit('_', 1)[0]).with_suffix('.cgns')
             if not ref_cgn.is_file():
                 test_case.add_failure_info('cgns', output=f'{ref_cgn} not found')
+            elif not (Path.cwd() / cgn_name).is_file():
+                test_case.add_failure_info('csv', output=f'{cgn_name} not found')
             else:
                 diff = diff_cgns(Path.cwd() / cgn_name, ref_cgn, cgns_tol=suite_spec.cgns_tol)
                 if diff:

From c81f2b9db63a5521a8ea6a1c49884bf189c1d57f Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 8 Oct 2024 13:47:52 -0600
Subject: [PATCH 189/571] fix(precon): Cast to CeedSize before multiplication

---
 interface/ceed-preconditioning.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 325c5c0017..c7f97ae721 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -459,7 +459,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   // Determine elem_dof relation for input
   CeedCall(CeedVectorCreate(ceed, num_nodes_in, &index_vec_in));
   CeedCall(CeedVectorGetArrayWrite(index_vec_in, CEED_MEM_HOST, &array));
-  for (CeedInt i = 0; i < num_nodes_in; i++) array[i] = i;
+  for (CeedSize i = 0; i < num_nodes_in; i++) array[i] = i;
   CeedCall(CeedVectorRestoreArray(index_vec_in, &array));
   CeedCall(CeedVectorCreate(ceed, num_elem_in * elem_size_in * num_comp_in, &elem_dof_in));
   CeedCall(CeedVectorSetValue(elem_dof_in, 0.0));
@@ -482,7 +482,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
     // Determine elem_dof relation for output
     CeedCall(CeedVectorCreate(ceed, num_nodes_out, &index_vec_out));
     CeedCall(CeedVectorGetArrayWrite(index_vec_out, CEED_MEM_HOST, &array));
-    for (CeedInt i = 0; i < num_nodes_out; i++) array[i] = i;
+    for (CeedSize i = 0; i < num_nodes_out; i++) array[i] = i;
     CeedCall(CeedVectorRestoreArray(index_vec_out, &array));
     CeedCall(CeedVectorCreate(ceed, num_elem_out * elem_size_out * num_comp_out, &elem_dof_out));
     CeedCall(CeedVectorSetValue(elem_dof_out, 0.0));
@@ -500,7 +500,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
     layout_er_out[2] = layout_er_in[2];
     elem_dof_a_out   = elem_dof_a_in;
   }
-  local_num_entries = elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
+  local_num_entries = (CeedSize)elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
 
   // Determine i, j locations for element matrices
   for (CeedInt e = 0; e < num_elem_in; e++) {
@@ -658,7 +658,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
     elem_rstr_orients_out      = elem_rstr_orients_in;
     elem_rstr_curl_orients_out = elem_rstr_curl_orients_in;
   }
-  local_num_entries = elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
+  local_num_entries = (CeedSize)elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
 
   // Loop over elements and put in data structure
   // We store B_mat_in, B_mat_out, BTD, elem_mat in row-major order
@@ -2500,7 +2500,7 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic
     CeedCall(CeedElemRestrictionApply(mult_elem_rstr, CEED_TRANSPOSE, ones_e_vec, sub_mult_l_vec, CEED_REQUEST_IMMEDIATE));
     CeedCall(CeedVectorGetArrayRead(sub_mult_l_vec, CEED_MEM_HOST, &sub_mult_array));
     // ---- Flag every node present in the current suboperator
-    for (CeedInt j = 0; j < l_vec_len; j++) {
+    for (CeedSize j = 0; j < l_vec_len; j++) {
       if (sub_mult_array[j] > 0.0) mult_array[j] += 1.0;
     }
     CeedCall(CeedVectorRestoreArrayRead(sub_mult_l_vec, &sub_mult_array));
@@ -2698,7 +2698,6 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   Ceed                 ceed, ceed_parent;
   bool                 interp = false, grad = false, is_tensor_basis = true;
   CeedInt              num_input_fields, P_1d, Q_1d, num_nodes, num_qpts, dim, num_comp = 1, num_elem = 1;
-  CeedSize             l_size = 1;
   CeedScalar          *mass, *laplace, *x, *fdm_interp, *lambda, *elem_avg;
   const CeedScalar    *interp_1d, *grad_1d, *q_weight_1d;
   CeedVector           q_data;
@@ -2756,7 +2755,6 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   CeedCall(CeedBasisGetDimension(basis, &dim));
   CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
 
   // Build and diagonalize 1D Mass and Laplacian
   CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));

From 6e25802e8a36deff9cd8f487fe0f213843aae78f Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sun, 6 Oct 2024 10:15:06 -0600
Subject: [PATCH 190/571] fix(jit): OOB array access in CeedNormalizePath

---
 interface/ceed-jit-tools.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index 4d4bf44e51..a45054cfc9 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -97,7 +97,7 @@ static int CeedNormalizePath(Ceed ceed, const char *source_file_path, char **nor
 
         while (last_slash[0] != '/' && last_slash != *normalized_source_file_path) last_slash--;
         CeedCheck(last_slash != *normalized_source_file_path, ceed, CEED_ERROR_MAJOR, "Malformed source path %s", source_file_path);
-        for (CeedInt i = 0; first_dot[i - 1]; i++) last_slash[i] = first_dot[i + 2];
+        for (CeedInt i = 0; first_dot[i + 1]; i++) last_slash[i] = first_dot[i + 2];
         search_from = last_slash;
       }
     }

From 13da2136da24494112d2fa141db08f720d183c7a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Oct 2024 13:01:57 -0600
Subject: [PATCH 191/571] ci - ASAN for CUDA

---
 .gitlab-ci.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 44cfd6afbf..a60ad43ec2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -186,6 +186,10 @@ noether-cuda:
     - echo "-------------- FC ------------------" && $FC --version
     - echo "-------------- NVCC ----------------" && $NVCC --version
     - echo "-------------- GCOV ----------------" && gcov --version
+# ASAN
+    - echo "-------------- ASAN ----------------"
+    - export ASAN=1 AFLAGS="-fsanitize=address -fsanitize=leak" ASAN_OPTIONS=protect_shadow_gap=0
+    - echo $AFLAGS
   script:
     - rm -f .SUCCESS
 # libCEED
@@ -200,6 +204,10 @@ noether-cuda:
 #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit realsearch=%
+# Rebuild without ASAN
+    - export ASAN=0
+    - make clean
+    - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
 # Libraries for examples
 # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe

From 8b0f73481df5f9f5ba2e5a396f0652e2438a7896 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Oct 2024 13:44:53 -0600
Subject: [PATCH 192/571] gpu - clean up small leaks

---
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 11 +++++++++++
 backends/cuda-ref/ceed-cuda-ref-qfunction.c   |  1 +
 backends/cuda-ref/ceed-cuda-ref-restriction.c |  4 ++++
 backends/hip-ref/ceed-hip-ref-operator.c      | 11 +++++++++++
 backends/hip-ref/ceed-hip-ref-qfunction.c     |  1 +
 backends/hip-ref/ceed-hip-ref-restriction.c   |  4 ++++
 6 files changed, 32 insertions(+)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index a3283f544c..8ff601ed4d 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -600,6 +600,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
       impl->num_points[e] = num_points_elem;
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
   impl->max_num_points = max_num_points;
 
@@ -779,6 +780,8 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
 
   // Process inputs
@@ -1543,6 +1546,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
       CeedCallBackend(CeedFree(&identity));
     }
   }
+  CeedCallBackend(CeedFree(&eval_modes_in));
 
   // Load into B_out, in order that they will be used in eval_modes_out
   {
@@ -1580,6 +1584,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
       CeedCallBackend(CeedFree(&identity));
     }
   }
+  CeedCallBackend(CeedFree(&eval_modes_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1743,6 +1748,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
 
   // Process inputs
@@ -1933,6 +1940,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
   }
+
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
index f52aebb685..957f02cbbe 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
@@ -68,6 +68,7 @@ static int CeedQFunctionDestroy_Cuda(CeedQFunction qf) {
   CeedQFunction_Cuda *data;
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
+  CeedCallBackend(CeedFree(&data->qfunction_source));
   if (data->module) CeedCallCuda(CeedQFunctionReturnCeed(qf), cuModuleUnload(data->module));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 8ed9e0c60e..f89e1f694d 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -80,6 +80,10 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
+      // Cleanup
+      CeedCallBackend(CeedFree(&offset_kernel_path));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_STANDARD: {
       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path));
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 95e0589bd1..f52abe5123 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -598,6 +598,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
       impl->num_points[e] = num_points_elem;
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
   impl->max_num_points = max_num_points;
 
@@ -777,6 +778,8 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
 
   // Process inputs
@@ -1540,6 +1543,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
       CeedCallBackend(CeedFree(&identity));
     }
   }
+  CeedCallBackend(CeedFree(&eval_modes_in));
 
   // Load into B_out, in order that they will be used in eval_modes_out
   {
@@ -1577,6 +1581,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
       CeedCallBackend(CeedFree(&identity));
     }
   }
+  CeedCallBackend(CeedFree(&eval_modes_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1740,6 +1745,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
 
   // Process inputs
@@ -1930,6 +1937,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
   }
+
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c
index e5e72cfd43..15d2dc7ae4 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunction.c
@@ -70,6 +70,7 @@ static int CeedQFunctionDestroy_Hip(CeedQFunction qf) {
   CeedQFunction_Hip *data;
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
+  CeedCallBackend(CeedFree(&data->qfunction_source));
   if (data->module) CeedCallHip(CeedQFunctionReturnCeed(qf), hipModuleUnload(data->module));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index eff205a018..95b0961387 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -79,6 +79,10 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
+      // Cleanup
+      CeedCallBackend(CeedFree(&offset_kernel_path));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_STANDARD: {
       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path));

From 38f1a2a73541e205612e8fbe58e021390b05e7fe Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Oct 2024 15:09:46 -0600
Subject: [PATCH 193/571] ci - shift some tests to CUDA from ROCM

---
 .gitlab-ci.yml | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a60ad43ec2..44369f11e0 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -205,14 +205,28 @@ noether-cuda:
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit realsearch=%
 # Rebuild without ASAN
-    - export ASAN=0
+    - export ASAN=0 AFLAGS= ASAN_OPTIONS=
     - make clean
     - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
 # Libraries for examples
 # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search="petsc fluids-navierstokes solids"
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search="petsc fluids solids"
+# -- MFEM v4.7
+    - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
+    - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=mfem
+# -- Nek5000 v19.0
+    - export COVERAGE=0
+    - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED
+    - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags
+    - export NPROC_POOL=1
+    - make -k -j$NPROC_GPU BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=nek NEK5K_DIR=$NEK5K_DIR
+# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
+    - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
+    - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
 # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
@@ -277,20 +291,6 @@ noether-rocm:
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids"
-# -- MFEM v4.7
-    - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
-    - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=mfem
-# -- Nek5000 v19.0
-    - export COVERAGE=0
-    - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED
-    - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags
-    - export NPROC_POOL=1
-    - make -k -j$NPROC_GPU BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=nek NEK5K_DIR=$NEK5K_DIR
-# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
-    - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
-    - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
 # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code

From c43990d09eeadeace583fdf1ca1727d8eeec0548 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Oct 2024 16:18:48 -0600
Subject: [PATCH 194/571] minor - simplification

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 10 ++--------
 backends/hip-ref/ceed-hip-ref-operator.c   | 10 ++--------
 2 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 8ff601ed4d..986d240dff 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1541,10 +1541,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar),
                                     cudaMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_in));
 
@@ -1579,10 +1576,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar),
                                     cudaMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_out));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index f52abe5123..6525c0b47a 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1538,10 +1538,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar),
                                   hipMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_in));
 
@@ -1576,10 +1573,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar),
                                   hipMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_out));
   return CEED_ERROR_SUCCESS;

From f3296101eabf62630b2cd6bda7e84579c522b5ce Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Oct 2024 17:57:52 -0600
Subject: [PATCH 195/571] CI - tidier unset

Co-authored-by: James Wright <james@jameswright.xyz>
---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 44369f11e0..e490cb533a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -205,7 +205,7 @@ noether-cuda:
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit realsearch=%
 # Rebuild without ASAN
-    - export ASAN=0 AFLAGS= ASAN_OPTIONS=
+    - unset ASAN AFLAGS ASAN_OPTIONS
     - make clean
     - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
 # Libraries for examples

From 681d0ea73ee05192cf73f31e6e4a886b41175395 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 19 Aug 2024 14:05:02 -0600
Subject: [PATCH 196/571] op - ReferenceCopy for CeedOperatorFieldGet*

---
 backends/blocked/ceed-blocked-operator.c      |  56 ++-
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |  17 +
 backends/cuda-gen/ceed-cuda-gen-operator.c    |  36 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 230 +++++++---
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |   2 +
 backends/hip-ref/ceed-hip-ref-operator.c      | 236 +++++++---
 backends/occa/ceed-occa-basis.cpp             |   8 +-
 backends/occa/ceed-occa-elem-restriction.cpp  |   6 +-
 backends/occa/ceed-occa-operator-field.cpp    |   6 +-
 backends/opt/ceed-opt-operator.c              |  74 +--
 backends/ref/ceed-ref-operator.c              | 132 ++++--
 .../ceed-sycl-gen-operator-build.sycl.cpp     |  16 +-
 .../sycl-gen/ceed-sycl-gen-operator.sycl.cpp  |  20 +-
 .../sycl-ref/ceed-sycl-ref-operator.sycl.cpp  | 428 ++++++++++--------
 backends/sycl-ref/ceed-sycl-ref.hpp           |   8 +-
 doc/sphinx/source/releasenotes.md             |   1 +
 examples/fluids/problems/advection.c          |  10 +-
 examples/fluids/problems/newtonian.c          |  10 +-
 examples/fluids/src/differential_filter.c     |   5 +-
 examples/fluids/src/setuplibceed.c            |  12 +-
 interface/ceed-operator.c                     |  74 ++-
 interface/ceed-preconditioning.c              |  59 ++-
 22 files changed, 989 insertions(+), 457 deletions(-)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 80b1f44865..c9f5ccfd46 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -105,6 +105,7 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
           // Empty case - won't occur
           break;
       }
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e]));
     }
 
@@ -122,6 +123,7 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
         CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         e_size = (CeedSize)P * num_comp * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)Q * size * block_size;
@@ -132,6 +134,7 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
         q_size = (CeedSize)Q * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
   }
@@ -154,7 +157,11 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
           skip_rstr[j] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   } else {
     for (CeedInt i = num_fields - 1; i >= 0; i--) {
@@ -176,7 +183,11 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
           apply_add_basis[i]    = true;
           e_data_out_indices[j] = i;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -259,13 +270,15 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed
                                                   CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX],
                                                   CeedOperator_Blocked *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_active;
     uint64_t     state;
     CeedEvalMode eval_mode;
     CeedVector   vec;
 
     // Get input vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
       if (skip_active) continue;
       else vec = in_vec;
     }
@@ -282,6 +295,7 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed
       // Get evec
       CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -300,15 +314,19 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc
 
     // Skip active input
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
 
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Basis action
@@ -324,6 +342,7 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][(CeedSize)e * elem_size * num_comp]));
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -347,6 +366,7 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     switch (eval_mode) {
@@ -365,6 +385,7 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun
         } else {
           CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -386,10 +407,13 @@ static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, Ce
 
     // Skip active inputs
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
@@ -470,6 +494,7 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
 
   // Output restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool       is_active;
     CeedVector vec;
 
     if (impl->skip_rstr_out[i]) continue;
@@ -477,11 +502,13 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
     CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields]));
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
     // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    if (is_active) vec = out_vec;
     // Restrict
     CeedCallBackend(
         CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Restore input arrays
@@ -533,14 +560,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
       CeedInt    field_size;
       CeedVector vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_in = qf_size_in;
@@ -552,13 +579,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
       CeedInt    field_size;
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
         qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_out = qf_size_out;
@@ -601,13 +628,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
 
     // Assemble QFunction
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
       CeedInt    field_size;
       CeedVector vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
-      if (vec != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) continue;
       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
       for (CeedInt field = 0; field < field_size; field++) {
         // Set current portion of input to 1.0
@@ -633,6 +662,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
               CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
               l_vec_array += field_size * Q * block_size;  // Advance the pointer by the size of the output
             }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
           // Apply QFunction
           CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
@@ -664,12 +694,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
     for (CeedInt out = 0; out < num_output_fields; out++) {
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 40ddf59ad1..c744ea4254 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -55,6 +55,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedBasis basis;
@@ -77,6 +78,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
@@ -96,6 +98,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
         was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -110,6 +113,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
         was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
   }
@@ -138,6 +142,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
@@ -150,6 +155,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
     code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
+  CeedCallBackend(CeedBasisDestroy(&basis));
 
   // Load basis data
   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -224,6 +230,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
   }
+  CeedCallBackend(CeedBasisDestroy(&basis));
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Restriction
@@ -302,6 +309,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
            << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
     }
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -324,6 +332,7 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
@@ -401,6 +410,7 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                 // LCOV_EXCL_STOP
     }
   }
+  CeedCallBackend(CeedBasisDestroy(&basis));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -489,6 +499,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
             code << "      readSliceQuadsOffset3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
@@ -809,6 +820,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedInt             num_comp, elem_size;
@@ -818,6 +830,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
@@ -858,7 +871,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
           input_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 840d97afb9..4c235984fd 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -133,30 +133,35 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 
   // Input vectors
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.inputs[i] = NULL;
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
   // Output vectors
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.outputs[i] = NULL;
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       output_vecs[i] = vec;
       // Check for multiple output modes
       CeedInt index = -1;
@@ -172,6 +177,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
       } else {
         data->fields.outputs[i] = data->fields.outputs[index];
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -203,26 +209,31 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
   // Restore output arrays
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       // Check for multiple output modes
       CeedInt index = -1;
       for (CeedInt j = 0; j < i; j++) {
@@ -234,6 +245,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
       if (index == -1) {
         CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 986d240dff..ceb940dad6 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -133,6 +133,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
     // Input passive vectorr with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
     is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
     skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT);
@@ -145,6 +146,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
     } else {
       CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     switch (eval_mode) {
       case CEED_EVAL_NONE:
@@ -171,6 +173,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
         } else {
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       }
     }
@@ -193,7 +196,11 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
           if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   } else {
     for (CeedInt i = num_fields - 1; i >= 0; i--) {
@@ -213,7 +220,11 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -279,7 +290,11 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
       impl->input_field_order[curr_index] = i;
       curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
       CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
       impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
@@ -294,7 +309,11 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
           impl->input_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   {
@@ -326,7 +345,11 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
           impl->output_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
@@ -363,10 +386,12 @@ static inline int CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_fiel
 
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       }
       impl->input_states[input_field] = state;
     }
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -411,11 +436,13 @@ static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field,
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
       CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       break;
     }
     case CEED_EVAL_WEIGHT:
       break;  // No action
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -449,6 +476,7 @@ static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field
       CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array));
     }
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -510,12 +538,10 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    bool                is_active = false;
-    CeedInt             field     = impl->output_field_order[i];
-    CeedEvalMode        eval_mode;
-    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
@@ -533,14 +559,18 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
         if (impl->apply_add_basis_out[field]) {
           CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         } else {
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
@@ -557,9 +587,18 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) continue;
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+    if (impl->skip_rstr_out[field]) {
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+      continue;
+    }
+    {
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Return work vector
@@ -641,7 +680,11 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
       impl->input_field_order[curr_index] = i;
       curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
       CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
       impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
@@ -656,7 +699,11 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
           impl->input_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   {
@@ -688,10 +735,13 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
           impl->output_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
-
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -737,11 +787,13 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
       CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       break;
     }
     case CEED_EVAL_WEIGHT:
       break;  // No action
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -817,12 +869,10 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
   // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    bool                is_active = false;
-    CeedInt             field     = impl->output_field_order[i];
-    CeedEvalMode        eval_mode;
-    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
@@ -840,14 +890,18 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
         if (impl->apply_add_basis_out[field]) {
           CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         } else {
           CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
@@ -864,9 +918,18 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) continue;
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+    if (impl->skip_rstr_out[field]) {
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+      continue;
+    }
+    {
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Restore work vector
@@ -931,6 +994,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_in = num_active_in;
     impl->qf_active_in  = active_inputs;
@@ -947,6 +1011,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_out = num_active_out;
   }
@@ -980,14 +1045,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
     for (CeedInt out = 0; out < num_output_fields; out++) {
       CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
       if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     // Apply QFunction
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
@@ -1001,6 +1066,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
     if (l_vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Restore input arrays
@@ -1053,13 +1119,14 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_in = basis;
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -1069,6 +1136,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -1085,7 +1153,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_out = basis;
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -1095,6 +1164,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -1206,6 +1276,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
   CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, cudaMemcpyHostToDevice));
   CeedCallBackend(CeedFree(&eval_modes_in));
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1237,14 +1309,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -1256,14 +1332,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -1287,6 +1367,8 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
   CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
   CeedCallBackend(CeedFree(&diagonal_kernel_path));
   CeedCallBackend(CeedFree(&diagonal_kernel_source));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1338,6 +1420,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec
     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr));
     CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
   elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
@@ -1423,13 +1507,17 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
 
     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
-      basis_in = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
       if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in));
@@ -1444,6 +1532,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
@@ -1453,14 +1542,18 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
 
     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator assembly with multiple active bases");
-      basis_out = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
       if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
@@ -1477,6 +1570,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
   CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
 
@@ -1579,6 +1673,10 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
     CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1683,6 +1781,8 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
     }
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1754,10 +1854,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
   // Clear active input Qvecs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedVector vec;
+    bool       is_active = false;
+    CeedVector l_vec;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec != CEED_VECTOR_ACTIVE) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    if (!is_active) continue;
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
@@ -1776,15 +1879,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
   // Loop over active fields
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool                is_active_at_points = true;
+    bool                is_active = false, is_active_at_points = true;
     CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
     CeedRestrictionType rstr_type;
     CeedVector          l_vec;
     CeedElemRestriction elem_rstr;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
     // -- Skip non-active input
-    if (l_vec != CEED_VECTOR_ACTIVE) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    if (!is_active) continue;
 
     // -- Get active restriction type
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -1793,18 +1898,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     else elem_size = max_num_points;
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     e_vec_size = elem_size * num_comp_active;
     for (CeedInt s = 0; s < e_vec_size; s++) {
-      bool         is_active_input = false;
+      bool         is_active = false;
       CeedEvalMode eval_mode;
       CeedVector   l_vec, q_vec = impl->q_vecs_in[i];
-      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
       // Skip non-active input
-      is_active_input = l_vec == CEED_VECTOR_ACTIVE;
-      if (!is_active_input) continue;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+      is_active = l_vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
+      if (!is_active) continue;
 
       // Update unit vector
       if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
@@ -1824,11 +1930,15 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         case CEED_EVAL_INTERP:
         case CEED_EVAL_GRAD:
         case CEED_EVAL_DIV:
-        case CEED_EVAL_CURL:
+        case CEED_EVAL_CURL: {
+          CeedBasis basis;
+
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
           CeedCallBackend(
               CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, active_e_vec_in, q_vec));
+          CeedCallBackend(CeedBasisDestroy(&basis));
           break;
+        }
         case CEED_EVAL_WEIGHT:
           break;  // No action
       }
@@ -1838,18 +1948,18 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
       // Output basis apply if needed
       for (CeedInt j = 0; j < num_output_fields; j++) {
-        bool                is_active_output = false;
-        CeedInt             elem_size        = 0;
+        bool                is_active = false;
+        CeedInt             elem_size = 0;
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
         CeedVector          l_vec, e_vec = impl->e_vecs_out[j], q_vec = impl->q_vecs_out[j];
         CeedElemRestriction elem_rstr;
-        CeedBasis           basis;
 
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
         // ---- Skip non-active output
-        is_active_output = l_vec == CEED_VECTOR_ACTIVE;
-        if (!is_active_output) continue;
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
+        is_active = l_vec == CEED_VECTOR_ACTIVE;
+        CeedCallBackend(CeedVectorDestroy(&l_vec));
+        if (!is_active) continue;
         if (!e_vec) e_vec = active_e_vec_out;
 
         // ---- Check if elem size matches
@@ -1881,10 +1991,14 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
           case CEED_EVAL_INTERP:
           case CEED_EVAL_GRAD:
           case CEED_EVAL_DIV:
-          case CEED_EVAL_CURL:
+          case CEED_EVAL_CURL: {
+            CeedBasis basis;
+
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
             CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            CeedCallBackend(CeedBasisDestroy(&basis));
             break;
+          }
           // LCOV_EXCL_START
           case CEED_EVAL_WEIGHT: {
             return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
@@ -1896,8 +2010,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
 
         // Restrict
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
         // Reset q_vec for
         if (eval_mode == CEED_EVAL_NONE) {
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index c3298b1b27..ee0dea2609 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -818,6 +818,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedInt             num_comp, elem_size;
@@ -827,6 +828,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 6525c0b47a..c6307037fa 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -132,6 +132,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
     // Input passive vectorr with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
     is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
     skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT);
@@ -144,6 +145,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
     } else {
       CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     switch (eval_mode) {
       case CEED_EVAL_NONE:
@@ -170,6 +172,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
         } else {
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       }
     }
@@ -192,7 +195,11 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
           if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
           skip_rstr[j] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   } else {
     for (CeedInt i = num_fields - 1; i >= 0; i--) {
@@ -212,7 +219,11 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -278,7 +289,11 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
       impl->input_field_order[curr_index] = i;
       curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
       CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
       impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
@@ -293,7 +308,11 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
           impl->input_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   {
@@ -325,7 +344,11 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
           impl->output_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
@@ -362,10 +385,12 @@ static inline int CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field
 
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       }
       impl->input_states[input_field] = state;
     }
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -410,11 +435,13 @@ static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, C
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
       CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       break;
     }
     case CEED_EVAL_WEIGHT:
       break;  // No action
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -448,6 +475,7 @@ static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field,
       CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array));
     }
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -508,12 +536,10 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    bool                is_active = false;
-    CeedInt             field     = impl->output_field_order[i];
-    CeedEvalMode        eval_mode;
-    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
@@ -531,14 +557,18 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
         if (impl->apply_add_basis_out[field]) {
           CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         } else {
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
@@ -555,9 +585,18 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) continue;
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+    if (impl->skip_rstr_out[field]) {
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+      continue;
+    }
+    {
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Return work vector
@@ -639,7 +678,11 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
       impl->input_field_order[curr_index] = i;
       curr_index++;
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
-      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
       CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
       impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
@@ -654,7 +697,11 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
           impl->input_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   {
@@ -686,10 +733,13 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
           impl->output_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
-
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
@@ -735,11 +785,13 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
       CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       break;
     }
     case CEED_EVAL_WEIGHT:
       break;  // No action
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -814,12 +866,10 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
   // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    bool                is_active = false;
-    CeedInt             field     = impl->output_field_order[i];
-    CeedEvalMode        eval_mode;
-    CeedVector          l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
 
     // Output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
@@ -837,14 +887,18 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
         if (impl->apply_add_basis_out[field]) {
           CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         } else {
           CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
@@ -861,9 +915,18 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) continue;
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+    if (impl->skip_rstr_out[field]) {
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+      continue;
+    }
+    {
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Restore work vector
@@ -928,6 +991,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_in = num_active_in;
     impl->qf_active_in  = active_inputs;
@@ -944,6 +1008,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_out = num_active_out;
   }
@@ -977,14 +1042,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
     for (CeedInt out = 0; out < num_output_fields; out++) {
       CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
       if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     // Apply QFunction
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
@@ -998,6 +1063,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
     if (l_vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Restore input arrays
@@ -1050,13 +1116,14 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_in = basis;
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -1066,6 +1133,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -1082,7 +1150,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_out = basis;
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -1092,6 +1161,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -1203,6 +1273,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
   CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice));
   CeedCallBackend(CeedFree(&eval_modes_in));
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1234,14 +1306,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -1253,14 +1329,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -1284,6 +1364,8 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
   CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
   CeedCallBackend(CeedFree(&diagonal_kernel_path));
   CeedCallBackend(CeedFree(&diagonal_kernel_source));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1335,6 +1417,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr));
     CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
   elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
@@ -1393,7 +1477,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op,
 //------------------------------------------------------------------------------
 static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
-  Ceed_Hip           *Hip_data;
+  Ceed_Hip           *hip_data;
   char               *assembly_kernel_source;
   const char         *assembly_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
@@ -1420,13 +1504,17 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
 
     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
-      basis_in = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
       if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in));
@@ -1441,6 +1529,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
@@ -1450,14 +1539,18 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
 
     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator assembly with multiple active bases");
-      basis_out = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
       if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
@@ -1474,6 +1567,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
   CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
 
@@ -1483,8 +1577,8 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
   asmb->block_size_x             = elem_size_in;
   asmb->block_size_y             = elem_size_out;
 
-  CeedCallBackend(CeedGetData(ceed, &Hip_data));
-  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > Hip_data->device_prop.maxThreadsPerBlock;
+  CeedCallBackend(CeedGetData(ceed, &hip_data));
+  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > hip_data->device_prop.maxThreadsPerBlock;
 
   if (fallback) {
     // Use fallback kernel with 1D threadblock
@@ -1576,6 +1670,10 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
     CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1680,6 +1778,8 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
     }
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1751,10 +1851,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
   // Clear active input Qvecs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedVector vec;
+    bool       is_active = false;
+    CeedVector l_vec;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec != CEED_VECTOR_ACTIVE) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    if (!is_active) continue;
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
@@ -1773,15 +1876,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
   // Loop over active fields
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool                is_active_at_points = true;
+    bool                is_active = false, is_active_at_points = true;
     CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
     CeedRestrictionType rstr_type;
     CeedVector          l_vec;
     CeedElemRestriction elem_rstr;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
     // -- Skip non-active input
-    if (l_vec != CEED_VECTOR_ACTIVE) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    if (!is_active) continue;
 
     // -- Get active restriction type
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -1790,18 +1895,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     else elem_size = max_num_points;
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     e_vec_size = elem_size * num_comp_active;
     for (CeedInt s = 0; s < e_vec_size; s++) {
-      bool         is_active_input = false;
+      bool         is_active = false;
       CeedEvalMode eval_mode;
       CeedVector   l_vec, q_vec = impl->q_vecs_in[i];
-      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
       // Skip non-active input
-      is_active_input = l_vec == CEED_VECTOR_ACTIVE;
-      if (!is_active_input) continue;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+      is_active = l_vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
+      if (!is_active) continue;
 
       // Update unit vector
       if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
@@ -1821,11 +1927,15 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         case CEED_EVAL_INTERP:
         case CEED_EVAL_GRAD:
         case CEED_EVAL_DIV:
-        case CEED_EVAL_CURL:
+        case CEED_EVAL_CURL: {
+          CeedBasis basis;
+
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
           CeedCallBackend(
               CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, active_e_vec_in, q_vec));
+          CeedCallBackend(CeedBasisDestroy(&basis));
           break;
+        }
         case CEED_EVAL_WEIGHT:
           break;  // No action
       }
@@ -1835,18 +1945,18 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
       // Output basis apply if needed
       for (CeedInt j = 0; j < num_output_fields; j++) {
-        bool                is_active_output = false;
-        CeedInt             elem_size        = 0;
+        bool                is_active = false;
+        CeedInt             elem_size = 0;
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
         CeedVector          l_vec, e_vec = impl->e_vecs_out[j], q_vec = impl->q_vecs_out[j];
         CeedElemRestriction elem_rstr;
-        CeedBasis           basis;
 
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
         // ---- Skip non-active output
-        is_active_output = l_vec == CEED_VECTOR_ACTIVE;
-        if (!is_active_output) continue;
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
+        is_active = l_vec == CEED_VECTOR_ACTIVE;
+        CeedCallBackend(CeedVectorDestroy(&l_vec));
+        if (!is_active) continue;
         if (!e_vec) e_vec = active_e_vec_out;
 
         // ---- Check if elem size matches
@@ -1878,10 +1988,14 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
           case CEED_EVAL_INTERP:
           case CEED_EVAL_GRAD:
           case CEED_EVAL_DIV:
-          case CEED_EVAL_CURL:
+          case CEED_EVAL_CURL: {
+            CeedBasis basis;
+
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
             CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            CeedCallBackend(CeedBasisDestroy(&basis));
             break;
+          }
           // LCOV_EXCL_START
           case CEED_EVAL_WEIGHT: {
             return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
@@ -1893,8 +2007,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
 
         // Restrict
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
         // Reset q_vec for
         if (eval_mode == CEED_EVAL_NONE) {
diff --git a/backends/occa/ceed-occa-basis.cpp b/backends/occa/ceed-occa-basis.cpp
index 0c33da5453..c9f94d18d0 100644
--- a/backends/occa/ceed-occa-basis.cpp
+++ b/backends/occa/ceed-occa-basis.cpp
@@ -43,9 +43,11 @@ Basis *Basis::from(CeedBasis basis) {
 }
 
 Basis *Basis::from(CeedOperatorField operatorField) {
-  CeedBasis basis;
-  CeedCallOcca(CeedOperatorFieldGetBasis(operatorField, &basis));
-  return from(basis);
+  CeedBasis ceedBasis;
+  CeedCallOcca(CeedOperatorFieldGetBasis(operatorField, &ceedBasis));
+  Basis *basis = from(ceedBasis);
+  CeedCallOcca(CeedBasisDestroy(&ceedBasis));
+  return basis;
 }
 
 int Basis::setCeedFields(CeedBasis basis) {
diff --git a/backends/occa/ceed-occa-elem-restriction.cpp b/backends/occa/ceed-occa-elem-restriction.cpp
index 7bfae3d87f..140041cb1d 100644
--- a/backends/occa/ceed-occa-elem-restriction.cpp
+++ b/backends/occa/ceed-occa-elem-restriction.cpp
@@ -200,10 +200,10 @@ ElemRestriction *ElemRestriction::from(CeedElemRestriction r) {
 
 ElemRestriction *ElemRestriction::from(CeedOperatorField operatorField) {
   CeedElemRestriction ceedElemRestriction;
-
   CeedCallOcca(CeedOperatorFieldGetElemRestriction(operatorField, &ceedElemRestriction));
-
-  return from(ceedElemRestriction);
+  ElemRestriction *elemRestriction = from(ceedElemRestriction);
+  CeedCallOcca(CeedElemRestrictionDestroy(&ceedElemRestriction));
+  return elemRestriction;
 }
 
 ElemRestriction *ElemRestriction::setupFrom(CeedElemRestriction r) {
diff --git a/backends/occa/ceed-occa-operator-field.cpp b/backends/occa/ceed-occa-operator-field.cpp
index 6716d11e06..4745b8dfc0 100644
--- a/backends/occa/ceed-occa-operator-field.cpp
+++ b/backends/occa/ceed-occa-operator-field.cpp
@@ -19,9 +19,7 @@ OperatorField::OperatorField(CeedOperatorField opField) : _isValid(false), _uses
   CeedElemRestriction ceedElemRestriction;
 
   CeedCallOccaValid(_isValid, CeedOperatorFieldGetBasis(opField, &ceedBasis));
-
   CeedCallOccaValid(_isValid, CeedOperatorFieldGetVector(opField, &ceedVector));
-
   CeedCallOccaValid(_isValid, CeedOperatorFieldGetElemRestriction(opField, &ceedElemRestriction));
 
   _isValid          = true;
@@ -30,6 +28,10 @@ OperatorField::OperatorField(CeedOperatorField opField) : _isValid(false), _uses
   vec             = Vector::from(ceedVector);
   basis           = Basis::from(ceedBasis);
   elemRestriction = ElemRestriction::from(ceedElemRestriction);
+
+  CeedCallOccaValid(_isValid, CeedBasisDestroy(&ceedBasis));
+  CeedCallOccaValid(_isValid, CeedVectorDestroy(&ceedVector));
+  CeedCallOccaValid(_isValid, CeedElemRestrictionDestroy(&ceedElemRestriction));
 }
 
 bool OperatorField::isValid() const { return _isValid; }
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 4de37eaf66..8057741208 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -105,6 +105,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
           // Empty case - won't occur
           break;
       }
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e]));
     }
 
@@ -124,6 +125,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
         CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         e_size = (CeedSize)P * num_comp * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)Q * size * block_size;
@@ -134,6 +136,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
         q_size = (CeedSize)Q * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
     // Initialize E-vec arrays
@@ -158,7 +161,11 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
           skip_rstr[j] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   } else {
     for (CeedInt i = num_fields - 1; i >= 0; i--) {
@@ -179,7 +186,11 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -289,6 +300,7 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun
           CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_in[i], (const CeedScalar **)&e_data[i]));
         }
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -301,31 +313,33 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction
                                              CeedInt num_input_fields, CeedInt block_size, CeedVector in_vec, bool skip_active,
                                              CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool                is_active_input = false;
+    bool                is_active;
     CeedInt             elem_size, size, num_comp;
     CeedEvalMode        eval_mode;
     CeedVector          vec;
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     // Skip active input
-    is_active_input = vec == CEED_VECTOR_ACTIVE;
-    if (skip_active && is_active_input) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (skip_active && is_active) continue;
 
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Restrict block active input
-    if (is_active_input && impl->block_rstr[i]) {
+    if (is_active && impl->block_rstr[i]) {
       CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i], e / block_size, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request));
     }
     // Basis action
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * Q * size]));
         }
         break;
@@ -334,11 +348,12 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
           CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp]));
         }
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -354,13 +369,12 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
                                               CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis,
                                               bool *skip_rstr, CeedOperator op, CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active;
+    CeedEvalMode eval_mode;
+    CeedVector   vec;
+    CeedBasis    basis;
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    // Get eval_mode
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     switch (eval_mode) {
@@ -376,6 +390,7 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
         } else {
           CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -387,10 +402,12 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
     if (skip_rstr[i]) continue;
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) vec = out_vec;
     // Restrict
     CeedCallBackend(
         CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -409,6 +426,7 @@ static inline int CeedOperatorRestoreInputs_Opt(CeedInt num_input_fields, CeedQF
     if (eval_mode != CEED_EVAL_WEIGHT && vec != CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_full[i], (const CeedScalar **)&e_data[i]));
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -531,14 +549,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
       CeedInt    field_size;
       CeedVector vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_in = qf_size_in;
@@ -550,13 +568,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
       CeedInt    field_size;
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
         qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_out = qf_size_out;
@@ -603,13 +621,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
 
     // Assemble QFunction
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
       CeedInt    field_size;
       CeedVector vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
-      if (vec != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) continue;
       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
       for (CeedInt field = 0; field < field_size; field++) {
         // Set current portion of input to 1.0
@@ -626,9 +646,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
           for (CeedInt out = 0; out < num_output_fields; out++) {
             CeedVector vec;
 
-            // Get output vector
-            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
             // Check if active output
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
             if (vec == CEED_VECTOR_ACTIVE) {
               CeedInt field_size;
 
@@ -636,6 +655,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
               CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
               l_vec_array += field_size * Q * block_size;  // Advance the pointer by the size of the output
             }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
           // Apply QFunction
           CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
@@ -666,12 +686,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
       for (CeedInt out = 0; out < num_output_fields; out++) {
         CeedVector vec;
 
-        // Get output vector
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
         // Check if active output
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
         if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) {
           CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
         }
+        CeedCallBackend(CeedVectorDestroy(&vec));
       }
     }
 
@@ -684,10 +704,10 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   for (CeedInt out = 0; out < num_output_fields; out++) {
     CeedVector vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     // Initialize array if active output
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     if (vec == CEED_VECTOR_ACTIVE) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_out[out], 0.0));
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Restore input arrays
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index de79e96d5b..4714387744 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -50,6 +50,7 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
     if (eval_mode != CEED_EVAL_WEIGHT) {
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e]));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     }
 
     switch (eval_mode) {
@@ -70,12 +71,14 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)Q * size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:  // Only on input fields
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         q_size = (CeedSize)Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
   }
@@ -98,7 +101,11 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
           skip_rstr[j] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   } else {
     for (CeedInt i = num_fields - 1; i >= 0; i--) {
@@ -120,7 +127,11 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
           apply_add_basis[i]    = true;
           e_data_out_indices[j] = i;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -198,14 +209,15 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun
                                               CeedVector in_vec, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX],
                                               CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    uint64_t            state;
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
+    bool         is_active;
+    uint64_t     state;
+    CeedEvalMode eval_mode;
+    CeedVector   vec;
 
     // Get input vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
       if (skip_active) continue;
       else vec = in_vec;
     }
@@ -218,13 +230,17 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun
       CeedCallBackend(CeedVectorGetState(vec, &state));
       // Skip restriction if input is unchanged
       if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
+        CeedElemRestriction elem_rstr;
+
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       }
       impl->input_states[i] = state;
       // Get evec
       CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -243,14 +259,18 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction
 
     // Skip active input
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Basis action
@@ -266,6 +286,7 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][(CeedSize)e * elem_size * num_comp]));
         CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -289,6 +310,7 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio
     // Get elem_size, eval_mode
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     switch (eval_mode) {
@@ -307,6 +329,7 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio
         } else {
           CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -328,10 +351,13 @@ static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, CeedQF
 
     // Skip active inputs
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
     // Restore input
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
@@ -371,8 +397,10 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
 
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[0], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[0], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -415,6 +443,7 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Output restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool                is_active;
     CeedVector          vec;
     CeedElemRestriction elem_rstr;
 
@@ -424,10 +453,13 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) vec = out_vec;
     // Restrict
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
 
   // Restore input arrays
@@ -482,6 +514,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_in = qf_size_in;
@@ -500,6 +533,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
         qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_out = qf_size_out;
@@ -528,12 +562,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
     // Assemble QFunction
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
       CeedInt    field_size;
       CeedVector vec;
 
       // Set Inputs
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec != CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) continue;
       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
       for (CeedInt field = 0; field < field_size; field++) {
         // Set current portion of input to 1.0
@@ -560,6 +597,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
               CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
               assembled_array += field_size * Q;  // Advance the pointer by the size of the output
             }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
           // Apply QFunction
           CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out));
@@ -597,6 +635,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
       if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) {
         CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -677,6 +716,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
 
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e]));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedVectorSetValue(e_vecs_full[i + start_e], 0.0));
     }
 
@@ -694,6 +734,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
           q_size = (CeedSize)max_num_points * size;
           CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         }
+        CeedCallBackend(CeedVectorDestroy(&vec));
         break;
       }
       case CEED_EVAL_INTERP:
@@ -708,6 +749,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)max_num_points * size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:  // Only on input fields
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
@@ -715,6 +757,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(
             CeedBasisApplyAtPoints(basis, 1, &max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
     // Initialize full arrays for E-vectors and Q-vectors
@@ -740,7 +783,11 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
           CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
           skip_rstr[j] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   } else {
     for (CeedInt i = num_fields - 1; i >= 0; i--) {
@@ -761,7 +808,11 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
           skip_rstr[j]       = true;
           apply_add_basis[i] = true;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -829,7 +880,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
                                                      CeedVector point_coords_elem, bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
                                                      CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool                is_active_input = false;
+    bool                is_active;
     CeedInt             elem_size, size, num_comp;
     CeedRestrictionType rstr_type;
     CeedEvalMode        eval_mode;
@@ -837,10 +888,11 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     // Skip active input
-    is_active_input = vec == CEED_VECTOR_ACTIVE;
-    if (skip_active && is_active_input) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (skip_active && is_active) continue;
 
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -848,7 +900,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Restrict block active input
-    if (is_active_input && !impl->skip_rstr_in[i]) {
+    if (is_active && !impl->skip_rstr_in[i]) {
       if (rstr_type == CEED_RESTRICTION_POINTS) {
         CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request));
       } else {
@@ -858,7 +910,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     // Basis action
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][num_points_offset * size]));
         }
         break;
@@ -868,17 +920,19 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
           CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
           CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp]));
         }
         CeedCallBackend(
             CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -891,6 +945,7 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
                                                       bool *apply_add_basis, bool *skip_rstr, CeedOperator op, CeedVector out_vec,
                                                       CeedVector point_coords_elem, CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool                is_active;
     CeedRestrictionType rstr_type;
     CeedEvalMode        eval_mode;
     CeedVector          vec;
@@ -916,6 +971,7 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
           CeedCallBackend(
               CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i]));
         }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -928,13 +984,16 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
     // Get output vector
     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) vec = out_vec;
     // Restrict
     if (rstr_type == CEED_RESTRICTION_POINTS) {
       CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request));
     } else {
       CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request));
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1055,12 +1114,14 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points");
         }
         // Get size of active input
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_in, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_in = qf_size_in;
@@ -1083,6 +1144,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points");
         }
         // Get size of active output
@@ -1090,6 +1152,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_out = qf_size_out;
@@ -1129,13 +1192,16 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
     // Assemble QFunction
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
       CeedInt    field_size;
       CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
       // Check if active input
-      if (vec != CEED_VECTOR_ACTIVE) continue;
+      if (!is_active) continue;
       // Get size of active input
       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
       for (CeedInt field = 0; field < field_size; field++) {
@@ -1162,6 +1228,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
               CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
               assembled_array += field_size * num_points;  // Advance the pointer by the size of the output
             }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
           // Apply QFunction
           CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
@@ -1200,6 +1267,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
       if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) {
         CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -1277,10 +1345,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
   // Clear input Qvecs
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool       is_active;
     CeedVector vec;
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec != CEED_VECTOR_ACTIVE) continue;
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (!is_active) continue;
     CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
   }
 
@@ -1301,15 +1372,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
     // Loop over points on element
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      bool                is_active_at_points = true;
+      bool                is_active_at_points = true, is_active;
       CeedInt             elem_size_active    = 1;
       CeedRestrictionType rstr_type;
       CeedVector          vec;
       CeedElemRestriction elem_rstr;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // -- Skip non-active input
-      if (vec != CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) continue;
 
       // -- Get active restriction type
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -1318,6 +1391,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
       if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
       else elem_size_active = num_points;
       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
       e_vec_size = elem_size_active * num_comp_active;
       for (CeedInt s = 0; s < e_vec_size; s++) {
@@ -1347,6 +1421,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
             CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
             CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs_in[i],
                                                    impl->q_vecs_in[i]));
+            CeedCallBackend(CeedBasisDestroy(&basis));
             break;
           case CEED_EVAL_WEIGHT:
             break;  // No action
@@ -1364,18 +1439,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
         // -- Grab diagonal value
         for (CeedInt j = 0; j < num_output_fields; j++) {
-          bool                is_active_output = false;
-          CeedInt             elem_size        = 0;
+          bool                is_active;
+          CeedInt             elem_size = 0;
           CeedRestrictionType rstr_type;
           CeedEvalMode        eval_mode;
           CeedVector          vec;
           CeedElemRestriction elem_rstr;
           CeedBasis           basis;
 
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
           // ---- Skip non-active output
-          is_active_output = vec == CEED_VECTOR_ACTIVE;
-          if (!is_active_output) continue;
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+          is_active = vec == CEED_VECTOR_ACTIVE;
+          CeedCallBackend(CeedVectorDestroy(&vec));
+          if (!is_active) continue;
 
           // ---- Check if elem size matches
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
@@ -1405,6 +1481,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
               CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
               CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[j],
                                                      impl->e_vecs_out[j]));
+              CeedCallBackend(CeedBasisDestroy(&basis));
               break;
             // LCOV_EXCL_START
             case CEED_EVAL_WEIGHT: {
@@ -1430,6 +1507,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           } else {
             CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
           }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
         }
         // -- Reset unit vector
         if (s == e_vec_size - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index a4edb6fc2b..b14e155124 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -155,12 +155,12 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         // LCOV_EXCL_STOP
       }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   // Check output bases for Q_1d, dim as well
   //   The only input basis might be CEED_BASIS_NONE
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-
     if (basis != CEED_BASIS_NONE) {
       bool is_tensor;
 
@@ -178,6 +178,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         // LCOV_EXCL_STOP
       }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   impl->dim  = dim;
   impl->Q_1d = Q_1d;
@@ -196,6 +197,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
         was_grad_found                = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -205,6 +207,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
         was_grad_found                = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
   }
@@ -273,6 +276,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     // Set field constants
     if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -321,6 +325,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
   code << "\n  // -- Output field constants and basis data --\n";
@@ -331,6 +336,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     // Set field constants
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
@@ -382,6 +388,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       }
         // LCOV_EXCL_STOP
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   code << "\n  // -- Element loop --\n";
   code << "  work_group_barrier(CLK_LOCAL_MEM_FENCE);\n";
@@ -431,6 +438,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
              << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n";
       }
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     // Basis action
     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -452,12 +460,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
                << i << ", r_t_" << i << ", elem_scratch);\n";
         } else {
           CeedInt P_1d;
+
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
           code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n";
           code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i
                << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i
                << ", r_t_" << i << ", elem_scratch);\n";
+          CeedCallBackend(CeedBasisDestroy(&basis));
         }
         break;
       case CEED_EVAL_WEIGHT:
@@ -466,6 +476,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
         impl->W = basis_impl->d_q_weight_1d;
         code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n";
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;  // No action
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
@@ -544,6 +555,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
                  << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i
                  << ", r_q_" << i << ");\n";
           }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
@@ -690,6 +702,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
           code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i
                << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i
                << ", r_v_" << i << ", elem_scratch);\n";
+          CeedCallBackend(CeedBasisDestroy(&basis));
         }
         break;
       // LCOV_EXCL_START
@@ -734,6 +747,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       code << "    writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
            << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n";
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
 
   code << "  }\n";
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index 100176b2d7..650a52cf4d 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -73,12 +73,15 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       impl->fields->inputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &impl->fields->inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -88,11 +91,13 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       impl->fields->outputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       output_vecs[i] = vec;
       // Check for multiple output modes
       CeedInt index = -1;
@@ -102,6 +107,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
           break;
         }
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
       if (index == -1) {
         CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &impl->fields->outputs[i]));
       } else {
@@ -152,11 +158,14 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &impl->fields->inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -165,10 +174,12 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       // Check for multiple output modes
       CeedInt index = -1;
 
@@ -178,6 +189,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
           break;
         }
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
       if (index == -1) {
         CeedCallBackend(CeedVectorRestoreArray(vec, &impl->fields->outputs[i]));
       }
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 8939d84a26..35c0fd5097 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -23,9 +23,9 @@ class CeedOperatorSyclLinearAssembleFallback;
 //------------------------------------------------------------------------------
 //  Get Basis Emode Pointer
 //------------------------------------------------------------------------------
-void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, const CeedScalar *interp,
+void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar *interp,
                                       const CeedScalar *grad) {
-  switch (e_mode) {
+  switch (eval_mode) {
     case CEED_EVAL_NONE:
       *basis_ptr = identity;
       break;
@@ -70,7 +70,7 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) {
   }
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
 
-  // QFunction assembly data
+  // QFunction assembly dataf
   for (CeedInt i = 0; i < impl->num_active_in; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
   }
@@ -78,12 +78,12 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) {
 
   // Diag data
   if (impl->diag) {
-    CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in));
-    CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out));
+    CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_in));
+    CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_out));
 
     CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw());
-    CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_in, sycl_data->sycl_context));
-    CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_out, sycl_data->sycl_context));
+    CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_in, sycl_data->sycl_context));
+    CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_out, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_identity, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_in, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_out, sycl_data->sycl_context));
@@ -130,28 +130,28 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    CeedEvalMode        e_mode;
+    CeedEvalMode        eval_mode;
     CeedVector          vec;
     CeedElemRestriction rstr;
     CeedBasis           basis;
 
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
 
     is_strided       = false;
     skip_restriction = false;
-    if (e_mode != CEED_EVAL_WEIGHT) {
+    if (eval_mode != CEED_EVAL_WEIGHT) {
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
 
       // Check whether this field can skip the element restriction:
-      // must be passive input, with  e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
+      // must be passive input, with  eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
 
       // First, check whether the field is input or output:
       if (is_input) {
         // Check for passive input:
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
         if (vec != CEED_VECTOR_ACTIVE) {
-          // Check  e_mode
-          if (e_mode == CEED_EVAL_NONE) {
+          // Check  eval_mode
+          if (eval_mode == CEED_EVAL_NONE) {
             // Check for  is_strided restriction
             CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided));
             if (is_strided) {
@@ -160,6 +160,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
             }
           }
         }
+        CeedCallBackend(CeedVectorDestroy(&vec));
       }
       if (skip_restriction) {
         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application
@@ -167,9 +168,10 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
       } else {
         CeedCallBackend(CeedElemRestrictionCreateVector(rstr, NULL, &e_vecs[i + start_e]));
       }
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
     }
 
-    switch (e_mode) {
+    switch (eval_mode) {
       case CEED_EVAL_NONE:
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
         q_size = (CeedSize)num_elem * Q * size;
@@ -184,6 +186,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
         CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         q_size = (CeedSize)num_elem * Q * size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
@@ -192,6 +195,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
         q_size = (CeedSize)num_elem * Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
@@ -252,35 +256,35 @@ static inline int CeedOperatorSetupInputs_Sycl(CeedInt num_input_fields, CeedQFu
                                                CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
                                                CeedOperator_Sycl *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode        e_mode;
-    CeedVector          vec;
-    CeedElemRestriction rstr;
+    bool         is_active;
+    CeedEvalMode eval_mode;
+    CeedVector   vec;
 
     // Get input vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
       if (skip_active) continue;
       else vec = in_vec;
     }
 
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
-    if (e_mode == CEED_EVAL_WEIGHT) {  // Skip
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      // Get input element restriction
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
-      if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
       // Restrict, if necessary
       if (!impl->e_vecs[i]) {
         // No restriction for this field; read data directly from vec.
         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       } else {
+        CeedElemRestriction rstr;
+
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
         CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-        // Get evec
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       }
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -292,35 +296,30 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie
                                               CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
                                               CeedOperator_Sycl *impl) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedElemRestriction rstr;
-    CeedEvalMode        e_mode;
-    CeedBasis           basis;
+    CeedEvalMode eval_mode;
+    CeedBasis    basis;
 
     // Skip active input
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
-    // Get elem_size,  e_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Basis action
-    switch (e_mode) {
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    switch (eval_mode) {
       case CEED_EVAL_NONE:
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
         break;
       case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i]));
-        break;
       case CEED_EVAL_GRAD:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -339,24 +338,26 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie
 static inline int CeedOperatorRestoreInputs_Sycl(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
                                                  const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode e_mode;
+    bool         is_active;
+    CeedEvalMode eval_mode;
     CeedVector   vec;
 
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
     // Skip active input
     if (skip_active) {
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      if (is_active) continue;
     }
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
-    if (e_mode == CEED_EVAL_WEIGHT) {  // Skip
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
       if (!impl->e_vecs[i]) {  // This was a  skip_restriction case
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
       } else {
         CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
       }
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -407,9 +408,10 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
     CeedElemRestriction rstr;
     CeedBasis           basis;
 
-    // Get elem_size,  eval_mode, size
+    // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
     // Basis action
@@ -417,12 +419,10 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
       case CEED_EVAL_NONE:
         break;
       case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in]));
-        break;
       case CEED_EVAL_GRAD:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in]));
+        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT:
@@ -445,6 +445,8 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Output restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool                is_active;
+    CeedEvalMode        eval_mode;
     CeedVector          vec;
     CeedElemRestriction rstr;
 
@@ -453,14 +455,14 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
     if (eval_mode == CEED_EVAL_NONE) {
       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_e_in], &e_data[i + num_input_fields]));
     }
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Restrict
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
-
-    CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request));
+    if (is_active) vec = out_vec;
+    CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
   }
 
   // Restore input arrays
@@ -506,9 +508,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
       CeedScalar *q_vec_array;
       CeedVector  vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
@@ -523,6 +524,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     impl->num_active_in = num_active_in;
     impl->qf_active_in  = active_in;
@@ -533,13 +535,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
     for (CeedInt i = 0; i < num_output_fields; i++) {
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     impl->num_active_out = num_active_out;
   }
@@ -574,14 +576,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
     for (CeedInt out = 0; out < num_output_fields; out++) {
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     // Apply QFunction
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
@@ -591,12 +593,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
   for (CeedInt out = 0; out < num_output_fields; out++) {
     CeedVector vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     // Check if active output
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Restore input arrays
@@ -627,10 +629,9 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedV
 static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   Ceed                ceed;
   Ceed_Sycl          *sycl_data;
-  CeedInt             num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0;
-  CeedEvalMode       *e_mode_in = NULL, *e_mode_out = NULL;
+  CeedInt             num_input_fields, num_output_fields, num_eval_mode_in = 0, num_comp = 0, dim = 1, num_eval_mode_out = 0;
+  CeedEvalMode       *eval_mode_in = NULL, *eval_mode_out = NULL;
   CeedBasis           basis_in = NULL, basis_out = NULL;
-  CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
   CeedQFunctionField *qf_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_fields;
@@ -648,28 +649,26 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedEvalMode        e_mode;
-      CeedElemRestriction rstr;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
-      CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
-      CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
-      CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND,
-                "Backend does not implement multi-field non-composite operator diagonal assembly");
-      rstr_in = rstr;
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
-      switch (e_mode) {
+      CeedEvalMode eval_mode;
+      CeedBasis    basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
+                "Backend does not implement operator diagonal assembly with multiple active bases");
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
+      switch (eval_mode) {
         case CEED_EVAL_NONE:
         case CEED_EVAL_INTERP:
-          CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in));
-          e_mode_in[num_e_mode_in] = e_mode;
-          num_e_mode_in += 1;
+          CeedCallBackend(CeedRealloc(num_eval_mode_in + 1, &eval_mode_in));
+          eval_mode_in[num_eval_mode_in] = eval_mode;
+          num_eval_mode_in += 1;
           break;
         case CEED_EVAL_GRAD:
-          CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in));
-          for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode;
-          num_e_mode_in += dim;
+          CeedCallBackend(CeedRealloc(num_eval_mode_in + dim, &eval_mode_in));
+          for (CeedInt d = 0; d < dim; d++) eval_mode_in[num_eval_mode_in + d] = eval_mode;
+          num_eval_mode_in += dim;
           break;
         case CEED_EVAL_WEIGHT:
         case CEED_EVAL_DIV:
@@ -677,6 +676,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
           break;  // Caught by QF Assembly
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -687,26 +687,26 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedEvalMode        e_mode;
-      CeedElemRestriction rstr;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
-      CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND,
-                "Backend does not implement multi-field non-composite operator diagonal assembly");
-      rstr_out = rstr;
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
-      switch (e_mode) {
+      CeedEvalMode eval_mode;
+      CeedBasis    basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
+                "Backend does not implement operator diagonal assembly with multiple active bases");
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
+      switch (eval_mode) {
         case CEED_EVAL_NONE:
         case CEED_EVAL_INTERP:
-          CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out));
-          e_mode_out[num_e_mode_out] = e_mode;
-          num_e_mode_out += 1;
+          CeedCallBackend(CeedRealloc(num_eval_mode_in + 1, &eval_mode_in));
+          eval_mode_in[num_eval_mode_in] = eval_mode;
+          num_eval_mode_in += 1;
           break;
         case CEED_EVAL_GRAD:
-          CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out));
-          for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode;
-          num_e_mode_out += dim;
+          CeedCallBackend(CeedRealloc(num_eval_mode_in + dim, &eval_mode_in));
+          for (CeedInt d = 0; d < dim; d++) eval_mode_in[num_eval_mode_in + d] = eval_mode;
+          num_eval_mode_in += dim;
           break;
         case CEED_EVAL_WEIGHT:
         case CEED_EVAL_DIV:
@@ -714,6 +714,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
           break;  // Caught by QF Assembly
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -722,12 +723,12 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedCalloc(1, &impl->diag));
   CeedOperatorDiag_Sycl *diag = impl->diag;
 
-  diag->basis_in       = basis_in;
-  diag->basis_out      = basis_out;
-  diag->h_e_mode_in    = e_mode_in;
-  diag->h_e_mode_out   = e_mode_out;
-  diag->num_e_mode_in  = num_e_mode_in;
-  diag->num_e_mode_out = num_e_mode_out;
+  diag->basis_in          = basis_in;
+  diag->basis_out         = basis_out;
+  diag->h_eval_mode_in    = eval_mode_in;
+  diag->h_eval_mode_out   = eval_mode_out;
+  diag->num_eval_mode_in  = num_eval_mode_in;
+  diag->num_eval_mode_out = num_eval_mode_out;
 
   // Kernel parameters
   CeedInt num_nodes, num_qpts;
@@ -745,8 +746,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   // CEED_EVAL_NONE
   CeedScalar *identity      = NULL;
   bool        has_eval_none = false;
-  for (CeedInt i = 0; i < num_e_mode_in; i++) has_eval_none = has_eval_none || (e_mode_in[i] == CEED_EVAL_NONE);
-  for (CeedInt i = 0; i < num_e_mode_out; i++) has_eval_none = has_eval_none || (e_mode_out[i] == CEED_EVAL_NONE);
+  for (CeedInt i = 0; i < num_eval_mode_in; i++) has_eval_none = has_eval_none || (eval_mode_in[i] == CEED_EVAL_NONE);
+  for (CeedInt i = 0; i < num_eval_mode_out; i++) has_eval_none = has_eval_none || (eval_mode_out[i] == CEED_EVAL_NONE);
 
   std::vector<sycl::event> e;
 
@@ -784,17 +785,23 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   sycl::event grad_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_out, diag->d_grad_out, g_len, e);
   copy_events.push_back(grad_out_copy);
 
-  // Arrays of  e_modes
-  CeedCallSycl(ceed, diag->d_e_mode_in = sycl::malloc_device<CeedEvalMode>(num_e_mode_in, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event e_mode_in_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_in, diag->d_e_mode_in, num_e_mode_in, e);
-  copy_events.push_back(e_mode_in_copy);
+  // Arrays of  eval_modes
+  CeedCallSycl(ceed, diag->d_eval_mode_in = sycl::malloc_device<CeedEvalMode>(num_eval_mode_in, sycl_data->sycl_device, sycl_data->sycl_context));
+  sycl::event eval_mode_in_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(eval_mode_in, diag->d_eval_mode_in, num_eval_mode_in, e);
+  copy_events.push_back(eval_mode_in_copy);
 
-  CeedCallSycl(ceed, diag->d_e_mode_out = sycl::malloc_device<CeedEvalMode>(num_e_mode_out, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event e_mode_out_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_out, diag->d_e_mode_out, num_e_mode_out, e);
-  copy_events.push_back(e_mode_out_copy);
+  CeedCallSycl(ceed, diag->d_eval_mode_out = sycl::malloc_device<CeedEvalMode>(num_eval_mode_out, sycl_data->sycl_device, sycl_data->sycl_context));
+  sycl::event eval_mode_out_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(eval_mode_out, diag->d_eval_mode_out, num_eval_mode_out, e);
+  copy_events.push_back(eval_mode_out_copy);
 
   // Restriction
-  diag->diag_rstr = rstr_out;
+  {
+    CeedElemRestriction rstr_out;
+
+    CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, NULL, &rstr_out));
+    diag->diag_rstr = rstr_out;
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  }
 
   // Wait for all copies to complete and handle exceptions
   CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
@@ -806,18 +813,18 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool is_point_block, const CeedInt num_elem,
                                            const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembled_qf_array, CeedScalar *elem_diag_array) {
-  const CeedSize      num_nodes      = diag->num_nodes;
-  const CeedSize      num_qpts       = diag->num_qpts;
-  const CeedSize      num_comp       = diag->num_comp;
-  const CeedSize      num_e_mode_in  = diag->num_e_mode_in;
-  const CeedSize      num_e_mode_out = diag->num_e_mode_out;
-  const CeedScalar   *identity       = diag->d_identity;
-  const CeedScalar   *interp_in      = diag->d_interp_in;
-  const CeedScalar   *grad_in        = diag->d_grad_in;
-  const CeedScalar   *interp_out     = diag->d_interp_out;
-  const CeedScalar   *grad_out       = diag->d_grad_out;
-  const CeedEvalMode *e_mode_in      = diag->d_e_mode_in;
-  const CeedEvalMode *e_mode_out     = diag->d_e_mode_out;
+  const CeedSize      num_nodes         = diag->num_nodes;
+  const CeedSize      num_qpts          = diag->num_qpts;
+  const CeedSize      num_comp          = diag->num_comp;
+  const CeedSize      num_eval_mode_in  = diag->num_eval_mode_in;
+  const CeedSize      num_eval_mode_out = diag->num_eval_mode_out;
+  const CeedScalar   *identity          = diag->d_identity;
+  const CeedScalar   *interp_in         = diag->d_interp_in;
+  const CeedScalar   *grad_in           = diag->d_grad_in;
+  const CeedScalar   *interp_out        = diag->d_interp_out;
+  const CeedScalar   *grad_out          = diag->d_grad_out;
+  const CeedEvalMode *eval_mode_in      = diag->d_eval_mode_in;
+  const CeedEvalMode *eval_mode_out     = diag->d_eval_mode_out;
 
   sycl::range<1> kernel_range(num_elem * num_nodes);
 
@@ -833,18 +840,18 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
     // Each element
     CeedInt d_out = -1;
     // Each basis eval mode pair
-    for (CeedSize e_out = 0; e_out < num_e_mode_out; e_out++) {
+    for (CeedSize e_out = 0; e_out < num_eval_mode_out; e_out++) {
       const CeedScalar *bt = NULL;
 
-      if (e_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out;
-      CeedOperatorGetBasisPointer_Sycl(&bt, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]);
+      if (eval_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out;
+      CeedOperatorGetBasisPointer_Sycl(&bt, eval_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]);
       CeedInt d_in = -1;
 
-      for (CeedSize e_in = 0; e_in < num_e_mode_in; e_in++) {
+      for (CeedSize e_in = 0; e_in < num_eval_mode_in; e_in++) {
         const CeedScalar *b = NULL;
 
-        if (e_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in;
-        CeedOperatorGetBasisPointer_Sycl(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]);
+        if (eval_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in;
+        CeedOperatorGetBasisPointer_Sycl(&b, eval_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]);
         // Each component
         for (CeedSize comp_out = 0; comp_out < num_comp; comp_out++) {
           // Each qpoint/node pair
@@ -855,7 +862,7 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
 
               for (CeedSize q = 0; q < num_qpts; q++) {
                 const CeedScalar qf_value =
-                    assembled_qf_array[((((e_in * num_comp + comp_in) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts +
+                    assembled_qf_array[((((e_in * num_comp + comp_in) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts +
                                        q];
 
                 e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid];
@@ -868,7 +875,8 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
 
             for (CeedSize q = 0; q < num_qpts; q++) {
               const CeedScalar qf_value =
-                  assembled_qf_array[((((e_in * num_comp + comp_out) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + q];
+                  assembled_qf_array[((((e_in * num_comp + comp_out) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts +
+                                     q];
               e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid];
             }
             elem_diag_array[(comp_out * num_elem + e) * num_nodes + tid] += e_value;
@@ -969,7 +977,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op,
 //------------------------------------------------------------------------------
 static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
   Ceed    ceed;
-  CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_e_mode_out = 0,
+  CeedInt num_input_fields, num_output_fields, num_eval_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_eval_mode_out = 0,
                                                num_B_out_mats_to_load = 0, size_B_out = 0, num_qpts = 0, elem_size = 0, num_elem, num_comp,
                                                mat_start = 0;
   CeedEvalMode       *eval_mode_in = NULL, *eval_mode_out = NULL;
@@ -991,19 +999,27 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
   // Note that the kernel will treat each dimension of a gradient action separately;
-  // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ e_mode_in will increment by dim.
-  // However, for the purposes of load_ing the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once,
+  // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ eval_mode_in will increment by dim.
+  // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once,
   // so num_B_in_mats_to_load will be incremented by 1.
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+    CeedVector vec;
 
     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in));
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
+      CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       if (eval_mode != CEED_EVAL_NONE) {
@@ -1011,43 +1027,53 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
         num_B_in_mats_to_load += 1;
         if (eval_mode == CEED_EVAL_GRAD) {
-          num_e_mode_in += dim;
+          num_eval_mode_in += dim;
           size_B_in += dim * elem_size * num_qpts;
         } else {
-          num_e_mode_in += 1;
+          num_eval_mode_in += 1;
           size_B_in += elem_size * num_qpts;
         }
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+    CeedVector vec;
 
     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
-      CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly");
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
+      CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
+                "Backend does not implement operator assembly with multiple active bases");
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       if (eval_mode != CEED_EVAL_NONE) {
         CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out));
         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
         num_B_out_mats_to_load += 1;
         if (eval_mode == CEED_EVAL_GRAD) {
-          num_e_mode_out += dim;
+          num_eval_mode_out += dim;
           size_B_out += dim * elem_size * num_qpts;
         } else {
-          num_e_mode_out += 1;
+          num_eval_mode_out += 1;
           size_B_out += elem_size * num_qpts;
         }
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
-  CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
+  CeedCheck(num_eval_mode_in > 0 && num_eval_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
 
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp));
@@ -1060,16 +1086,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
 
   // Kernel setup
-  int elems_per_block   = 1;
-  asmb->elems_per_block = elems_per_block;
-  asmb->block_size_x    = elem_size;
-  asmb->block_size_y    = elem_size;
-  asmb->num_e_mode_in   = num_e_mode_in;
-  asmb->num_e_mode_out  = num_e_mode_out;
-  asmb->num_qpts        = num_qpts;
-  asmb->num_nodes       = elem_size;
-  asmb->block_size      = elem_size * elem_size * elems_per_block;
-  asmb->num_comp        = num_comp;
+  int elems_per_block     = 1;
+  asmb->elems_per_block   = elems_per_block;
+  asmb->block_size_x      = elem_size;
+  asmb->block_size_y      = elem_size;
+  asmb->num_eval_mode_in  = num_eval_mode_in;
+  asmb->num_eval_mode_out = num_eval_mode_out;
+  asmb->num_qpts          = num_qpts;
+  asmb->num_nodes         = elem_size;
+  asmb->block_size        = elem_size * elem_size * elems_per_block;
+  asmb->num_comp          = num_comp;
 
   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices
   CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in));
@@ -1126,6 +1152,10 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
       mat_start += dim * elem_size * num_qpts;
     }
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1136,25 +1166,25 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
                                            CeedScalar *values_array) {
   // This kernels assumes B_in and B_out have the same number of quadrature points and basis points.
   // TODO: expand to more general cases
-  CeedOperatorAssemble_Sycl *asmb           = impl->asmb;
-  const CeedInt              num_elem       = asmb->num_elem;
-  const CeedSize             num_nodes      = asmb->num_nodes;
-  const CeedSize             num_comp       = asmb->num_comp;
-  const CeedSize             num_qpts       = asmb->num_qpts;
-  const CeedSize             num_e_mode_in  = asmb->num_e_mode_in;
-  const CeedSize             num_e_mode_out = asmb->num_e_mode_out;
+  CeedOperatorAssemble_Sycl *asmb              = impl->asmb;
+  const CeedInt              num_elem          = asmb->num_elem;
+  const CeedSize             num_nodes         = asmb->num_nodes;
+  const CeedSize             num_comp          = asmb->num_comp;
+  const CeedSize             num_qpts          = asmb->num_qpts;
+  const CeedSize             num_eval_mode_in  = asmb->num_eval_mode_in;
+  const CeedSize             num_eval_mode_out = asmb->num_eval_mode_out;
 
   // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element,
   // comp_in, comp_out, node_row, node_col
   const CeedSize comp_out_stride = num_nodes * num_nodes;
   const CeedSize comp_in_stride  = comp_out_stride * num_comp;
   const CeedSize e_stride        = comp_in_stride * num_comp;
-  // Strides for QF array, slowest --> fastest:  e_mode_in, comp_in,  e_mode_out, comp_out, elem, qpt
-  const CeedSize q_e_stride          = num_qpts;
-  const CeedSize q_comp_out_stride   = num_elem * q_e_stride;
-  const CeedSize q_e_mode_out_stride = q_comp_out_stride * num_comp;
-  const CeedSize q_comp_in_stride    = q_e_mode_out_stride * num_e_mode_out;
-  const CeedSize q_e_mode_in_stride  = q_comp_in_stride * num_comp;
+  // Strides for QF array, slowest --> fastest:  eval_mode_in, comp_in,  eval_mode_out, comp_out, elem, qpt
+  const CeedSize q_e_stride             = num_qpts;
+  const CeedSize q_comp_out_stride      = num_elem * q_e_stride;
+  const CeedSize q_eval_mode_out_stride = q_comp_out_stride * num_comp;
+  const CeedSize q_comp_in_stride       = q_eval_mode_out_stride * num_eval_mode_out;
+  const CeedSize q_eval_mode_in_stride  = q_comp_in_stride * num_comp;
 
   CeedScalar *B_in, *B_out;
   B_in                       = asmb->d_B_in;
@@ -1177,19 +1207,19 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
         CeedScalar result        = 0.0;
         CeedSize   qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e;
 
-        for (CeedSize e_mode_in = 0; e_mode_in < num_e_mode_in; e_mode_in++) {
-          CeedSize b_in_index = e_mode_in * num_qpts * num_nodes;
+        for (CeedSize eval_mode_in = 0; eval_mode_in < num_eval_mode_in; eval_mode_in++) {
+          CeedSize b_in_index = eval_mode_in * num_qpts * num_nodes;
 
-          for (CeedSize e_mode_out = 0; e_mode_out < num_e_mode_out; e_mode_out++) {
-            CeedSize b_out_index = e_mode_out * num_qpts * num_nodes;
-            CeedSize qf_index    = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in;
+          for (CeedSize eval_mode_out = 0; eval_mode_out < num_eval_mode_out; eval_mode_out++) {
+            CeedSize b_out_index = eval_mode_out * num_qpts * num_nodes;
+            CeedSize qf_index    = qf_index_comp + q_eval_mode_out_stride * eval_mode_out + q_eval_mode_in_stride * eval_mode_in;
 
             // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
             for (CeedSize j = 0; j < num_qpts; j++) {
               result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l];
             }
-          }  // end of  e_mode_out
-        }  // end of  e_mode_in
+          }  // end of  eval_mode_out
+        }  // end of  eval_mode_in
         CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l;
 
         values_array[val_index] = result;
@@ -1212,20 +1242,20 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons
   const CeedInt              num_nodes      = asmb->num_nodes;
   const CeedInt              num_comp       = asmb->num_comp;
   const CeedInt              num_qpts       = asmb->num_qpts;
-  const CeedInt              num_e_mode_in  = asmb->num_e_mode_in;
-  const CeedInt              num_e_mode_out = asmb->num_e_mode_out;
+  const CeedInt              num_eval_mode_in  = asmb->num_eval_mode_in;
+  const CeedInt              num_eval_mode_out = asmb->num_eval_mode_out;
 
   // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: elememt,
   // comp_in, comp_out, node_row, node_col
   const CeedInt comp_out_stride = num_nodes * num_nodes;
   const CeedInt comp_in_stride  = comp_out_stride * num_comp;
   const CeedInt e_stride        = comp_in_stride * num_comp;
-  // Strides for QF array, slowest --> fastest:  e_mode_in, comp_in,  e_mode_out, comp_out, elem, qpt
+  // Strides for QF array, slowest --> fastest:  eval_mode_in, comp_in,  eval_mode_out, comp_out, elem, qpt
   const CeedInt q_e_stride         = num_qpts;
   const CeedInt q_comp_out_stride  = num_elem * q_e_stride;
-  const CeedInt q_e_mode_out_stride = q_comp_out_stride * num_comp;
-  const CeedInt q_comp_in_stride   = q_e_mode_out_stride * num_e_mode_out;
-  const CeedInt q_e_mode_in_stride  = q_comp_in_stride * num_comp;
+  const CeedInt q_eval_mode_out_stride = q_comp_out_stride * num_comp;
+  const CeedInt q_comp_in_stride   = q_eval_mode_out_stride * num_eval_mode_out;
+  const CeedInt q_eval_mode_in_stride  = q_comp_in_stride * num_comp;
 
   CeedScalar *B_in, *B_out;
   B_in                        = asmb->d_B_in;
@@ -1254,17 +1284,17 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons
           for (CeedInt i = 0; i < num_nodes; i++) {
             CeedScalar result        = 0.0;
             CeedInt    qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e;
-            for (CeedInt  e_mode_in = 0;  e_mode_in < num_e_mode_in;  e_mode_in++) {
-              CeedInt b_in_index =  e_mode_in * num_qpts * num_nodes;
-              for (CeedInt  e_mode_out = 0;  e_mode_out < num_e_mode_out;  e_mode_out++) {
-                CeedInt b_out_index =  e_mode_out * num_qpts * num_nodes;
-                CeedInt qf_index    = qf_index_comp + q_e_mode_out_stride *  e_mode_out + q_e_mode_in_stride *  e_mode_in;
+            for (CeedInt  eval_mode_in = 0;  eval_mode_in < num_eval_mode_in;  eval_mode_in++) {
+              CeedInt b_in_index =  eval_mode_in * num_qpts * num_nodes;
+              for (CeedInt  eval_mode_out = 0;  eval_mode_out < num_eval_mode_out;  eval_mode_out++) {
+                CeedInt b_out_index =  eval_mode_out * num_qpts * num_nodes;
+                CeedInt qf_index    = qf_index_comp + q_eval_mode_out_stride *  eval_mode_out + q_eval_mode_in_stride *  eval_mode_in;
                 // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
                 for (CeedInt j = 0; j < num_qpts; j++) {
                   result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l];
                 }
-              }  // end of  e_mode_out
-            }    // end of  e_mode_in
+              }  // end of  eval_mode_out
+            }    // end of  eval_mode_in
             CeedInt val_index       = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l;
             values_array[val_index] = result;
           }  // end of loop over element node index, i
diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp
index ae765dbafc..c435e65e4f 100644
--- a/backends/sycl-ref/ceed-sycl-ref.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref.hpp
@@ -86,16 +86,16 @@ typedef struct {
   CeedBasis           basis_in, basis_out;
   CeedElemRestriction diag_rstr, point_block_diag_rstr;
   CeedVector          elem_diag, point_block_elem_diag;
-  CeedInt             num_e_mode_in, num_e_mode_out, num_nodes;
+  CeedInt             num_eval_mode_in, num_eval_mode_out, num_nodes;
   CeedInt             num_qpts, num_comp;  // Kernel parameters
-  CeedEvalMode       *h_e_mode_in, *h_e_mode_out;
-  CeedEvalMode       *d_e_mode_in, *d_e_mode_out;
+  CeedEvalMode       *h_eval_mode_in, *h_eval_mode_out;
+  CeedEvalMode       *d_eval_mode_in, *d_eval_mode_out;
   CeedScalar         *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out;
 } CeedOperatorDiag_Sycl;
 
 typedef struct {
   CeedInt     num_elem, block_size_x, block_size_y, elems_per_block;
-  CeedInt     num_e_mode_in, num_e_mode_out, num_qpts, num_nodes, block_size, num_comp;  // Kernel parameters
+  CeedInt     num_eval_mode_in, num_eval_mode_out, num_qpts, num_nodes, block_size, num_comp;  // Kernel parameters
   bool        fallback;
   CeedScalar *d_B_in, *d_B_out;
 } CeedOperatorAssemble_Sycl;
diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 19300bc7c5..764e2949ea 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -11,6 +11,7 @@ On this page we provide a summary of the main API changes, new features and exam
 - Add `bool` field type for `CeedQFunctionContext` and related interfaces to use `bool` fields.
 - `CEED_BASIS_COLLOCATED` removed; users should only use `CEED_BASIS_NONE`.
 - Remove unneeded pointer for `CeedElemRestrictionGetELayout`.
+- Require use of `Ceed*Destroy()` on Ceed objects returned from `CeedOperatorFieldGet*()`;
 
 ### New features
 
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 1795d6db0b..d1ce21755b 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -37,12 +37,10 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator
 
     PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i));
-    PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data));
 
     PetscCallCeed(ceed, CeedOperatorGetContext(sub_ops[sub_op_index], &qf_ctx));
   }
@@ -74,6 +72,10 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q));
   PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index b7349474e0..f6033cd392 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -173,12 +173,10 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator
 
     PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i));
-    PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data));
 
     PetscCallCeed(ceed, CeedOperatorGetContext(sub_ops[sub_op_index], &qf_ctx));
   }
@@ -203,6 +201,10 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q));
   PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index c476f1896c..12c8771ca3 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -137,8 +137,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
         char              field_name[PETSC_MAX_PATH_LEN];
         PetscCall(PetscSNPrintf(field_name, PETSC_MAX_PATH_LEN, "v%" PetscInt_FMT, i));
         PetscCallCeed(ceed, CeedOperatorGetFieldByName(diff_filter->op_rhs_ctx->op, field_name, &op_field));
-        PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filter));
-        PetscCallCeed(ceed, CeedOperatorFieldGetBasis(op_field, &basis_filter));
+        PetscCallCeed(ceed, CeedOperatorFieldGetData(op_field, NULL, &elem_restr_filter, &basis_filter, NULL));
       }
 
       PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_lhs, NULL, NULL, &op_lhs_sub));
@@ -151,6 +150,8 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
       PetscCallCeed(ceed, CeedOperatorSetField(op_lhs_sub, "Grad_v", elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE));
 
       PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_lhs, op_lhs_sub));
+      PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_filter));
+      PetscCallCeed(ceed, CeedBasisDestroy(&basis_filter));
       PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_lhs));
       PetscCallCeed(ceed, CeedOperatorDestroy(&op_lhs_sub));
     }
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index a5bdfe1bb1..1754d5e521 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -30,12 +30,10 @@ static PetscErrorCode CreateKSPMassOperator_Unstabilized(User user, CeedOperator
 
     PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i));
-    PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data));
   }
 
   PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q));
@@ -47,6 +45,10 @@ static PetscErrorCode CreateKSPMassOperator_Unstabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -199,10 +201,12 @@ static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
     PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
     PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q));
+    PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
 
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "x", &field));
     PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_x));
     PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x));
+    PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_x));
   }
 
   {  // Get bases
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 5881846665..66c952117c 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -118,6 +118,9 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField
   if (basis == CEED_BASIS_NONE) fprintf(stream, "%s      No basis\n", pre);
   if (vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s      Active vector\n", pre);
   else if (vec == CEED_VECTOR_NONE) fprintf(stream, "%s      No vector\n", pre);
+
+  CeedCall(CeedVectorDestroy(&vec));
+  CeedCall(CeedBasisDestroy(&basis));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -160,7 +163,9 @@ int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) {
 }
 
 /**
-  @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`
+  @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the `active_basis` with @ref CeedBasisDestroy().
 
   @param[in]  op           `CeedOperator` to find active `CeedBasis` for
   @param[out] active_basis `CeedBasis` for active input vector or `NULL` for composite operator
@@ -175,7 +180,9 @@ int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) {
 }
 
 /**
-  @brief Find the active input and output vector `CeedBasis` for a non-composite `CeedOperator`
+  @brief Find the active input and output vector `CeedBasis` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the bases with @ref CeedBasisDestroy().
 
   @param[in]  op                  `CeedOperator` to find active `CeedBasis` for
   @param[out] active_input_basis  `CeedBasis` for active input vector or `NULL` for composite operator
@@ -207,8 +214,10 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C
 
           CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
           CeedCheck(!*active_input_basis || *active_input_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active input CeedBases found");
-          *active_input_basis = basis;
+          if (!*active_input_basis) CeedCall(CeedBasisReferenceCopy(basis, active_input_basis));
+          CeedCall(CeedBasisDestroy(&basis));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
       CeedCheck(*active_input_basis, ceed, CEED_ERROR_INCOMPLETE, "No active input CeedBasis found");
     }
@@ -225,8 +234,10 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C
 
           CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
           CeedCheck(!*active_output_basis || *active_output_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active output CeedBases found");
-          *active_output_basis = basis;
+          if (!*active_output_basis) CeedCall(CeedBasisReferenceCopy(basis, active_output_basis));
+          CeedCall(CeedBasisDestroy(&basis));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
       CeedCheck(*active_output_basis, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedBasis found");
     }
@@ -235,7 +246,9 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C
 }
 
 /**
-  @brief Find the active vector `CeedElemRestriction` for a non-composite `CeedOperator`
+  @brief Find the active vector `CeedElemRestriction` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the `active_rstr` with @ref CeedElemRestrictionDestroy().
 
   @param[in]  op          `CeedOperator` to find active `CeedElemRestriction` for
   @param[out] active_rstr `CeedElemRestriction` for active input vector or NULL for composite operator
@@ -250,7 +263,9 @@ int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *a
 }
 
 /**
-  @brief Find the active input and output vector `CeedElemRestriction` for a non-composite `CeedOperator`
+  @brief Find the active input and output vector `CeedElemRestriction` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the restrictions with @ref CeedElemRestrictionDestroy().
 
   @param[in]  op                 `CeedOperator` to find active `CeedElemRestriction` for
   @param[out] active_input_rstr  `CeedElemRestriction` for active input vector or NULL for composite operator
@@ -282,8 +297,10 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
 
           CeedCall(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
           CeedCheck(!*active_input_rstr || *active_input_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active input CeedElemRestrictions found");
-          *active_input_rstr = rstr;
+          if (!*active_input_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_input_rstr));
+          CeedCall(CeedElemRestrictionDestroy(&rstr));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
       CeedCheck(*active_input_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active input CeedElemRestriction found");
     }
@@ -300,8 +317,10 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
 
           CeedCall(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
           CeedCheck(!*active_output_rstr || *active_output_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active output CeedElemRestrictions found");
-          *active_output_rstr = rstr;
+          if (!*active_output_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_output_rstr));
+          CeedCall(CeedElemRestrictionDestroy(&rstr));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
       CeedCheck(*active_output_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedElemRestriction found");
     }
@@ -563,6 +582,7 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) {
       CeedCall(CeedBasisIsTensor(basis, &is_tensor));
       *has_tensor_bases &= is_tensor;
     }
+    CeedCall(CeedBasisDestroy(&basis));
   }
   for (CeedInt i = 0; i < num_outputs; i++) {
     bool      is_tensor;
@@ -573,6 +593,7 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) {
       CeedCall(CeedBasisIsTensor(basis, &is_tensor));
       *has_tensor_bases &= is_tensor;
     }
+    CeedCall(CeedBasisDestroy(&basis));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1138,7 +1159,9 @@ int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name
 }
 
 /**
-  @brief Get the `CeedElemRestriction` of a `CeedOperator` Field
+  @brief Get the `CeedElemRestriction` of a `CeedOperator` Field.
+
+  Note: Caller is responsible for destroying the `rstr` with @ref CeedElemRestrictionDestroy().
 
   @param[in]  op_field `CeedOperator` Field
   @param[out] rstr     Variable to store `CeedElemRestriction`
@@ -1148,12 +1171,15 @@ int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name
   @ref Advanced
 **/
 int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr) {
-  *rstr = op_field->elem_rstr;
+  *rstr = NULL;
+  CeedCall(CeedElemRestrictionReferenceCopy(op_field->elem_rstr, rstr));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Get the `CeedBasis` of a `CeedOperator` Field
+  @brief Get the `CeedBasis` of a `CeedOperator` Field.
+
+  Note: Caller is responsible for destroying the `basis` with @ref CeedBasisDestroy().
 
   @param[in]  op_field `CeedOperator` Field
   @param[out] basis    Variable to store `CeedBasis`
@@ -1163,12 +1189,15 @@ int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRest
   @ref Advanced
 **/
 int CeedOperatorFieldGetBasis(CeedOperatorField op_field, CeedBasis *basis) {
-  *basis = op_field->basis;
+  *basis = NULL;
+  CeedCall(CeedBasisReferenceCopy(op_field->basis, basis));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Get the `CeedVector` of a `CeedOperator` Field
+  @brief Get the `CeedVector` of a `CeedOperator` Field.
+
+  Note: Caller is responsible for destroying the `vec` with @ref CeedVectorDestroy().
 
   @param[in]  op_field `CeedOperator` Field
   @param[out] vec      Variable to store `CeedVector`
@@ -1178,14 +1207,17 @@ int CeedOperatorFieldGetBasis(CeedOperatorField op_field, CeedBasis *basis) {
   @ref Advanced
 **/
 int CeedOperatorFieldGetVector(CeedOperatorField op_field, CeedVector *vec) {
-  *vec = op_field->vec;
+  *vec = NULL;
+  CeedCall(CeedVectorReferenceCopy(op_field->vec, vec));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
   @brief Get the data of a `CeedOperator` Field.
 
-  Any arguments set as `NULL` are ignored.
+  Any arguments set as `NULL` are ignored..
+
+  Note: Caller is responsible for destroying the `rstr`, `basis`, and `vec`.
 
   @param[in]  op_field   `CeedOperator` Field
   @param[out] field_name Variable to store the field name
@@ -1652,12 +1684,15 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
 
         CeedCall(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
         CeedCall(CeedElemRestrictionGetFlopsEstimate(rstr, CEED_NOTRANSPOSE, &rstr_flops));
+        CeedCall(CeedElemRestrictionDestroy(&rstr));
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
         CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
+      CeedCall(CeedVectorDestroy(&vec));
     }
     // QF FLOPs
     {
@@ -1686,12 +1721,15 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
 
         CeedCall(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
         CeedCall(CeedElemRestrictionGetFlopsEstimate(rstr, CEED_TRANSPOSE, &rstr_flops));
+        CeedCall(CeedElemRestrictionDestroy(&rstr));
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
         CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
+      CeedCall(CeedVectorDestroy(&vec));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -2036,6 +2074,7 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques
           if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) {
             CeedCall(CeedVectorSetValue(vec, 0.0));
           }
+          CeedCall(CeedVectorDestroy(&vec));
         }
       }
       // Apply
@@ -2054,11 +2093,14 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques
       CeedCall(CeedOperatorGetFields(op, NULL, NULL, &num_output_fields, &output_fields));
       // Zero all output vectors
       for (CeedInt i = 0; i < num_output_fields; i++) {
+        bool       is_active;
         CeedVector vec;
 
         CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec));
-        if (vec == CEED_VECTOR_ACTIVE) vec = out;
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) vec = out;
         if (vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0));
+        if (!is_active) CeedCall(CeedVectorDestroy(&vec));
       }
       // Apply
       if (op->num_elem > 0) CeedCall(op->ApplyAdd(op, in, out, request));
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index c7f97ae721..346135ebdf 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -150,6 +150,9 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
 
       CeedCall(CeedOperatorFieldGetData(input_fields[i], &field_name, &rstr, &basis, &vec));
       CeedCall(CeedOperatorSetField(op_fallback, field_name, rstr, basis, vec));
+      CeedCall(CeedVectorDestroy(&vec));
+      CeedCall(CeedElemRestrictionDestroy(&rstr));
+      CeedCall(CeedBasisDestroy(&basis));
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char         *field_name;
@@ -159,6 +162,9 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
 
       CeedCall(CeedOperatorFieldGetData(output_fields[i], &field_name, &rstr, &basis, &vec));
       CeedCall(CeedOperatorSetField(op_fallback, field_name, rstr, basis, vec));
+      CeedCall(CeedVectorDestroy(&vec));
+      CeedCall(CeedElemRestrictionDestroy(&rstr));
+      CeedCall(CeedBasisDestroy(&basis));
     }
     {
       CeedQFunctionAssemblyData data;
@@ -528,6 +534,8 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
     CeedCall(CeedVectorRestoreArrayRead(elem_dof_out, &elem_dof_a_out));
     CeedCall(CeedVectorDestroy(&elem_dof_out));
   }
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in));
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -778,6 +786,8 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
   }
   CeedCall(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
   CeedCall(CeedVectorDestroy(&assembled_qf));
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in));
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -818,6 +828,8 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num
     elem_size_out = elem_size_in;
     num_comp_out  = num_comp_in;
   }
+  CeedCall(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCall(CeedElemRestrictionDestroy(&rstr_out));
   *num_entries = (CeedSize)elem_size_in * num_comp_in * elem_size_out * num_comp_out * num_elem_in;
   return CEED_ERROR_SUCCESS;
 }
@@ -860,39 +872,45 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char         *field_name;
     CeedVector          vec;
-    CeedElemRestriction rstr;
-    CeedBasis           basis;
+    CeedElemRestriction rstr  = NULL;
+    CeedBasis           basis = NULL;
 
     CeedCall(CeedOperatorFieldGetName(input_fields[i], &field_name));
     CeedCall(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      rstr  = rstr_coarse;
-      basis = basis_coarse;
-      CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_fine));
+      CeedCall(CeedElemRestrictionReferenceCopy(rstr_coarse, &rstr));
+      CeedCall(CeedBasisReferenceCopy(basis_coarse, &basis));
+      if (!rstr_fine) CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_fine));
     } else {
       CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr));
       CeedCall(CeedOperatorFieldGetBasis(input_fields[i], &basis));
     }
     CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec));
+    CeedCall(CeedVectorDestroy(&vec));
+    CeedCall(CeedElemRestrictionDestroy(&rstr));
+    CeedCall(CeedBasisDestroy(&basis));
   }
   // -- Clone output fields
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char         *field_name;
     CeedVector          vec;
-    CeedElemRestriction rstr;
-    CeedBasis           basis;
+    CeedElemRestriction rstr  = NULL;
+    CeedBasis           basis = NULL;
 
     CeedCall(CeedOperatorFieldGetName(output_fields[i], &field_name));
     CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      rstr  = rstr_coarse;
-      basis = basis_coarse;
-      CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_fine));
+      CeedCall(CeedElemRestrictionReferenceCopy(rstr_coarse, &rstr));
+      CeedCall(CeedBasisReferenceCopy(basis_coarse, &basis));
+      if (!rstr_fine) CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_fine));
     } else {
       CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr));
       CeedCall(CeedOperatorFieldGetBasis(output_fields[i], &basis));
     }
     CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec));
+    CeedCall(CeedVectorDestroy(&vec));
+    CeedCall(CeedElemRestrictionDestroy(&rstr));
+    CeedCall(CeedBasisDestroy(&basis));
   }
   // -- Clone QFunctionAssemblyData
   {
@@ -1014,6 +1032,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
 
   // Cleanup
   CeedCall(CeedVectorDestroy(&mult_vec));
+  CeedCall(CeedElemRestrictionDestroy(&rstr_fine));
   CeedCall(CeedElemRestrictionDestroy(&rstr_p_mult_fine));
   CeedCall(CeedBasisDestroy(&basis_c_to_f));
   return CEED_ERROR_SUCCESS;
@@ -1429,6 +1448,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         (*data)->active_elem_rstrs_in[num_active_bases_in] = NULL;
         CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr_in));
         CeedCall(CeedElemRestrictionReferenceCopy(elem_rstr_in, &(*data)->active_elem_rstrs_in[num_active_bases_in]));
+        CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in));
         CeedCall(CeedRealloc(num_active_bases_in + 1, &num_eval_modes_in));
         num_eval_modes_in[index] = 0;
         CeedCall(CeedRealloc(num_active_bases_in + 1, &eval_modes_in));
@@ -1450,7 +1470,9 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         }
         num_eval_modes_in[index] += q_comp;
       }
+      CeedCall(CeedBasisDestroy(&basis_in));
     }
+    CeedCall(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -1484,6 +1506,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         (*data)->active_elem_rstrs_out[num_active_bases_out] = NULL;
         CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr_out));
         CeedCall(CeedElemRestrictionReferenceCopy(elem_rstr_out, &(*data)->active_elem_rstrs_out[num_active_bases_out]));
+        CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out));
         CeedCall(CeedRealloc(num_active_bases_out + 1, &num_eval_modes_out));
         num_eval_modes_out[index] = 0;
         CeedCall(CeedRealloc(num_active_bases_out + 1, &eval_modes_out));
@@ -1505,7 +1528,9 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         }
         num_eval_modes_out[index] += q_comp;
       }
+      CeedCall(CeedBasisDestroy(&basis_out));
     }
+    CeedCall(CeedVectorDestroy(&vec));
   }
   (*data)->num_active_bases_in   = num_active_bases_in;
   (*data)->num_eval_modes_in     = num_eval_modes_in;
@@ -2166,6 +2191,7 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
 
     CeedCall(CeedElemRestrictionRestoreOffsets(active_elem_rstr, &offsets));
     CeedCall(CeedElemRestrictionRestoreOffsets(point_block_active_elem_rstr, &point_block_offsets));
+    CeedCall(CeedElemRestrictionDestroy(&active_elem_rstr));
     CeedCall(CeedElemRestrictionDestroy(&point_block_active_elem_rstr));
   }
   return CEED_ERROR_SUCCESS;
@@ -2494,6 +2520,7 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic
     // -- Sub operator multiplicity
     CeedCall(CeedOperatorGetActiveElemRestriction(sub_operators[i], &elem_rstr));
     CeedCall(CeedElemRestrictionCreateUnorientedCopy(elem_rstr, &mult_elem_rstr));
+    CeedCall(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCall(CeedElemRestrictionCreateVector(mult_elem_rstr, &sub_mult_l_vec, &ones_e_vec));
     CeedCall(CeedVectorSetValue(sub_mult_l_vec, 0.0));
     CeedCall(CeedElemRestrictionApply(mult_elem_rstr, CEED_NOTRANSPOSE, ones_l_vec, ones_e_vec, CEED_REQUEST_IMMEDIATE));
@@ -2542,6 +2569,7 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin
 
     CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine));
     CeedCall(CeedBasisCreateProjection(basis_coarse, basis_fine, &basis_c_to_f));
+    CeedCall(CeedBasisDestroy(&basis_fine));
   }
 
   // Core code
@@ -2597,6 +2625,7 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
     CeedCall(CeedBasisGetDimension(basis_fine, &dim));
     CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp));
     CeedCall(CeedBasisGetNumNodes1D(basis_fine, &P_1d_f));
+    CeedCall(CeedBasisDestroy(&basis_fine));
     CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c));
     P_1d_c = dim == 1 ? num_nodes_c : dim == 2 ? sqrt(num_nodes_c) : cbrt(num_nodes_c);
     CeedCall(CeedCalloc(P_1d_f, &q_ref));
@@ -2660,6 +2689,7 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f
     CeedCall(CeedBasisGetDimension(basis_fine, &dim));
     CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp));
     CeedCall(CeedBasisGetNumNodes(basis_fine, &num_nodes_f));
+    CeedCall(CeedBasisDestroy(&basis_fine));
     CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c));
     CeedCall(CeedCalloc(num_nodes_f * dim, &q_ref));
     CeedCall(CeedCalloc(num_nodes_f, &q_weight));
@@ -2743,9 +2773,10 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
       CeedCall(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       interp = interp || eval_mode == CEED_EVAL_INTERP;
       grad   = grad || eval_mode == CEED_EVAL_GRAD;
-      CeedCall(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-      CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
+      if (!basis) CeedCall(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!rstr) CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
     }
+    CeedCall(CeedVectorDestroy(&vec));
   }
   CeedCheck(basis, ceed, CEED_ERROR_BACKEND, "No active field set");
   CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
@@ -2906,8 +2937,10 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
 
   // Cleanup
   CeedCall(CeedVectorDestroy(&q_data));
-  CeedCall(CeedBasisDestroy(&fdm_basis));
+  CeedCall(CeedElemRestrictionDestroy(&rstr));
   CeedCall(CeedElemRestrictionDestroy(&rstr_qd_i));
+  CeedCall(CeedBasisDestroy(&basis));
+  CeedCall(CeedBasisDestroy(&fdm_basis));
   CeedCall(CeedQFunctionDestroy(&qf_fdm));
   return CEED_ERROR_SUCCESS;
 }

From e03fef56705b317edc4a39dfee40c8366660a6d6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Sep 2024 12:26:07 -0600
Subject: [PATCH 197/571] rust - fix CeedOperatorFieldGet*

---
 rust/libceed/src/basis.rs            |   7 ++
 rust/libceed/src/elem_restriction.rs |   7 ++
 rust/libceed/src/operator.rs         | 129 ++++++++++++++++++---------
 3 files changed, 100 insertions(+), 43 deletions(-)

diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index 4c11fb79b4..b308076ab6 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -173,6 +173,13 @@ impl<'a> Basis<'a> {
         })
     }
 
+    pub(crate) fn from_raw(ptr: bind_ceed::CeedBasis) -> crate::Result<Self> {
+        Ok(Self {
+            ptr,
+            _lifeline: PhantomData,
+        })
+    }
+
     pub fn create_tensor_H1_Lagrange(
         ceed: &crate::Ceed,
         dim: usize,
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index 950a840403..e800f56c1e 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -193,6 +193,13 @@ impl<'a> ElemRestriction<'a> {
         })
     }
 
+    pub(crate) fn from_raw(ptr: bind_ceed::CeedElemRestriction) -> crate::Result<Self> {
+        Ok(Self {
+            ptr,
+            _lifeline: PhantomData,
+        })
+    }
+
     pub fn create_oriented(
         ceed: &crate::Ceed,
         nelem: usize,
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index bd5d7f37f4..a1e5db55cb 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -17,6 +17,9 @@ use crate::prelude::*;
 #[derive(Debug)]
 pub struct OperatorField<'a> {
     pub(crate) ptr: bind_ceed::CeedOperatorField,
+    pub(crate) vector: crate::Vector<'a>,
+    pub(crate) elem_restriction: crate::ElemRestriction<'a>,
+    pub(crate) basis: crate::Basis<'a>,
     _lifeline: PhantomData<&'a ()>,
 }
 
@@ -24,6 +27,39 @@ pub struct OperatorField<'a> {
 // Implementations
 // -----------------------------------------------------------------------------
 impl<'a> OperatorField<'a> {
+    pub(crate) fn from_raw(
+        ptr: bind_ceed::CeedOperatorField,
+        ceed: crate::Ceed,
+    ) -> crate::Result<Self> {
+        let vector = {
+            let mut vector_ptr = std::ptr::null_mut();
+            let ierr = unsafe { bind_ceed::CeedOperatorFieldGetVector(ptr, &mut vector_ptr) };
+            ceed.check_error(ierr)?;
+            crate::Vector::from_raw(vector_ptr)?
+        };
+        let elem_restriction = {
+            let mut elem_restriction_ptr = std::ptr::null_mut();
+            let ierr = unsafe {
+                bind_ceed::CeedOperatorFieldGetElemRestriction(ptr, &mut elem_restriction_ptr)
+            };
+            ceed.check_error(ierr)?;
+            crate::ElemRestriction::from_raw(elem_restriction_ptr)?
+        };
+        let basis = {
+            let mut basis_ptr = std::ptr::null_mut();
+            let ierr = unsafe { bind_ceed::CeedOperatorFieldGetBasis(ptr, &mut basis_ptr) };
+            ceed.check_error(ierr)?;
+            crate::Basis::from_raw(basis_ptr)?
+        };
+        Ok(Self {
+            ptr,
+            vector,
+            elem_restriction,
+            basis,
+            _lifeline: PhantomData,
+        })
+    }
+
     /// Get the name of an OperatorField
     ///
     /// ```
@@ -110,24 +146,21 @@ impl<'a> OperatorField<'a> {
     ///     inputs[1].elem_restriction().is_none(),
     ///     "Incorrect field ElemRestriction"
     /// );
+    ///
+    /// let outputs = op.outputs()?;
+    ///
+    /// assert!(
+    ///     outputs[0].elem_restriction().is_some(),
+    ///     "Incorrect field ElemRestriction"
+    /// );
     /// # Ok(())
     /// # }
     /// ```
     pub fn elem_restriction(&self) -> ElemRestrictionOpt {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorFieldGetElemRestriction(self.ptr, &mut ptr);
-        }
-        if ptr == unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE } {
+        if self.elem_restriction.ptr == unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE } {
             ElemRestrictionOpt::None
         } else {
-            let slice = unsafe {
-                std::slice::from_raw_parts(
-                    &ptr as *const bind_ceed::CeedElemRestriction as *const crate::ElemRestriction,
-                    1 as usize,
-                )
-            };
-            ElemRestrictionOpt::Some(&slice[0])
+            ElemRestrictionOpt::Some(&self.elem_restriction)
         }
     }
 
@@ -172,20 +205,10 @@ impl<'a> OperatorField<'a> {
     /// # }
     /// ```
     pub fn basis(&self) -> BasisOpt {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorFieldGetBasis(self.ptr, &mut ptr);
-        }
-        if ptr == unsafe { bind_ceed::CEED_BASIS_NONE } {
+        if self.basis.ptr == unsafe { bind_ceed::CEED_BASIS_NONE } {
             BasisOpt::None
         } else {
-            let slice = unsafe {
-                std::slice::from_raw_parts(
-                    &ptr as *const bind_ceed::CeedBasis as *const crate::Basis,
-                    1 as usize,
-                )
-            };
-            BasisOpt::Some(&slice[0])
+            BasisOpt::Some(&self.basis)
         }
     }
 
@@ -222,26 +245,20 @@ impl<'a> OperatorField<'a> {
     ///
     /// assert!(inputs[0].vector().is_active(), "Incorrect field Vector");
     /// assert!(inputs[1].vector().is_none(), "Incorrect field Vector");
+    ///
+    /// let outputs = op.outputs()?;
+    ///
+    /// assert!(outputs[0].vector().is_active(), "Incorrect field Vector");
     /// # Ok(())
     /// # }
     /// ```
     pub fn vector(&self) -> VectorOpt {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorFieldGetVector(self.ptr, &mut ptr);
-        }
-        if ptr == unsafe { bind_ceed::CEED_VECTOR_ACTIVE } {
+        if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_ACTIVE } {
             VectorOpt::Active
-        } else if ptr == unsafe { bind_ceed::CEED_VECTOR_NONE } {
+        } else if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_NONE } {
             VectorOpt::None
         } else {
-            let slice = unsafe {
-                std::slice::from_raw_parts(
-                    &ptr as *const bind_ceed::CeedVector as *const crate::Vector,
-                    1 as usize,
-                )
-            };
-            VectorOpt::Some(&slice[0])
+            VectorOpt::Some(&self.vector)
         }
     }
 }
@@ -814,7 +831,7 @@ impl<'a> Operator<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[crate::OperatorField]> {
+    pub fn inputs(&self) -> crate::Result<Vec<crate::OperatorField>> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
@@ -831,11 +848,24 @@ impl<'a> Operator<'a> {
         // Convert raw C pointers to fixed length slice
         let inputs_slice = unsafe {
             std::slice::from_raw_parts(
-                inputs_ptr as *const crate::OperatorField,
+                inputs_ptr as *mut bind_ceed::CeedOperatorField,
                 num_inputs as usize,
             )
         };
-        Ok(inputs_slice)
+        // And finally build vec
+        let ceed = {
+            let mut ptr = std::ptr::null_mut();
+            let mut ptr_copy = std::ptr::null_mut();
+            unsafe {
+                bind_ceed::CeedOperatorGetCeed(self.op_core.ptr, &mut ptr);
+                bind_ceed::CeedReferenceCopy(ptr, &mut ptr_copy); // refcount
+            }
+            crate::Ceed { ptr }
+        };
+        let inputs = (0..num_inputs as usize)
+            .map(|i| crate::OperatorField::from_raw(inputs_slice[i], ceed.clone()))
+            .collect::<crate::Result<Vec<_>>>()?;
+        Ok(inputs)
     }
 
     /// Get a slice of Operator outputs
@@ -873,7 +903,7 @@ impl<'a> Operator<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[crate::OperatorField]> {
+    pub fn outputs(&self) -> crate::Result<Vec<crate::OperatorField>> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
@@ -890,11 +920,24 @@ impl<'a> Operator<'a> {
         // Convert raw C pointers to fixed length slice
         let outputs_slice = unsafe {
             std::slice::from_raw_parts(
-                outputs_ptr as *const crate::OperatorField,
+                outputs_ptr as *mut bind_ceed::CeedOperatorField,
                 num_outputs as usize,
             )
         };
-        Ok(outputs_slice)
+        // And finally build vec
+        let ceed = {
+            let mut ptr = std::ptr::null_mut();
+            let mut ptr_copy = std::ptr::null_mut();
+            unsafe {
+                bind_ceed::CeedOperatorGetCeed(self.op_core.ptr, &mut ptr);
+                bind_ceed::CeedReferenceCopy(ptr, &mut ptr_copy); // refcount
+            }
+            crate::Ceed { ptr }
+        };
+        let outputs = (0..num_outputs as usize)
+            .map(|i| crate::OperatorField::from_raw(outputs_slice[i], ceed.clone()))
+            .collect::<crate::Result<Vec<_>>>()?;
+        Ok(outputs)
     }
 
     /// Check if Operator is setup correctly

From 567c3c299f2b1379a2173048e3e4ce3fc0fd52d0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Sep 2024 14:48:28 -0600
Subject: [PATCH 198/571] rust - restore tests from #1451

---
 rust/libceed/src/operator.rs | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index a1e5db55cb..809c78f6e6 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -142,6 +142,14 @@ impl<'a> OperatorField<'a> {
     ///     inputs[0].elem_restriction().is_some(),
     ///     "Incorrect field ElemRestriction"
     /// );
+    /// if let ElemRestrictionOpt::Some(r) = inputs[0].elem_restriction() {
+    ///     assert_eq!(
+    ///         r.num_elements(),
+    ///         ne,
+    ///         "Incorrect field ElemRestriction number of elements"
+    ///     );
+    /// }
+    ///
     /// assert!(
     ///     inputs[1].elem_restriction().is_none(),
     ///     "Incorrect field ElemRestriction"
@@ -153,6 +161,13 @@ impl<'a> OperatorField<'a> {
     ///     outputs[0].elem_restriction().is_some(),
     ///     "Incorrect field ElemRestriction"
     /// );
+    /// if let ElemRestrictionOpt::Some(r) = outputs[0].elem_restriction() {
+    ///     assert_eq!(
+    ///         r.num_elements(),
+    ///         ne,
+    ///         "Incorrect field ElemRestriction number of elements"
+    ///     );
+    /// }
     /// # Ok(())
     /// # }
     /// ```
@@ -196,7 +211,21 @@ impl<'a> OperatorField<'a> {
     /// let inputs = op.inputs()?;
     ///
     /// assert!(inputs[0].basis().is_some(), "Incorrect field Basis");
+    /// if let BasisOpt::Some(b) = inputs[0].basis() {
+    ///     assert_eq!(
+    ///         b.num_quadrature_points(),
+    ///         q,
+    ///         "Incorrect field Basis number of quadrature points"
+    ///     );
+    /// }
     /// assert!(inputs[1].basis().is_some(), "Incorrect field Basis");
+    /// if let BasisOpt::Some(b) = inputs[1].basis() {
+    ///     assert_eq!(
+    ///         b.num_quadrature_points(),
+    ///         q,
+    ///         "Incorrect field Basis number of quadrature points"
+    ///     );
+    /// }
     ///
     /// let outputs = op.outputs()?;
     ///

From ac2269bd6b9128b5262c5b76aac90f6c3e7aeebf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 11 Sep 2024 11:03:32 -0600
Subject: [PATCH 199/571] sycl - consistency and correctness fixes

---
 .../sycl-ref/ceed-sycl-ref-operator.sycl.cpp  | 91 ++++++++++---------
 backends/sycl-ref/ceed-sycl-ref.hpp           |  1 -
 2 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 35c0fd5097..a046690420 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -70,7 +70,7 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) {
   }
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
 
-  // QFunction assembly dataf
+  // QFunction assembly data
   for (CeedInt i = 0; i < impl->num_active_in; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
   }
@@ -132,7 +132,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
   for (CeedInt i = 0; i < num_fields; i++) {
     CeedEvalMode        eval_mode;
     CeedVector          vec;
-    CeedElemRestriction rstr;
+    CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
@@ -140,7 +140,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
     is_strided       = false;
     skip_restriction = false;
     if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
 
       // Check whether this field can skip the element restriction:
       // must be passive input, with  eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
@@ -153,10 +153,10 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
           // Check  eval_mode
           if (eval_mode == CEED_EVAL_NONE) {
             // Check for  is_strided restriction
-            CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided));
+            CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
             if (is_strided) {
               // Check if vector is already in preferred backend ordering
-              CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &skip_restriction));
+              CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
             }
           }
         }
@@ -166,9 +166,9 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application
         e_vecs[i + start_e] = NULL;
       } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(rstr, NULL, &e_vecs[i + start_e]));
+        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
       }
-      CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     }
 
     switch (eval_mode) {
@@ -276,11 +276,11 @@ static inline int CeedOperatorSetupInputs_Sycl(CeedInt num_input_fields, CeedQFu
         // No restriction for this field; read data directly from vec.
         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       } else {
-        CeedElemRestriction rstr;
+        CeedElemRestriction elem_rstr;
 
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
-        CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-        CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       }
     }
@@ -297,7 +297,6 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie
                                               CeedOperator_Sycl *impl) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedEvalMode eval_mode;
-    CeedBasis    basis;
 
     // Skip active input
     if (skip_active) {
@@ -316,11 +315,14 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
         break;
       case CEED_EVAL_INTERP:
-      case CEED_EVAL_GRAD:
+      case CEED_EVAL_GRAD: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
         CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       case CEED_EVAL_WEIGHT:
         break;  // No action
       case CEED_EVAL_DIV:
@@ -405,13 +407,12 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedElemRestriction rstr;
-    CeedBasis           basis;
+    CeedElemRestriction elem_rstr;
 
     // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
-    CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
     // Basis action
@@ -419,18 +420,22 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
       case CEED_EVAL_NONE:
         break;
       case CEED_EVAL_INTERP:
-      case CEED_EVAL_GRAD:
+      case CEED_EVAL_GRAD: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in]));
         CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT:
+      case CEED_EVAL_WEIGHT: {
         Ceed ceed;
 
         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         break;  // Should not occur
+      }
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL: {
         Ceed ceed;
@@ -448,7 +453,7 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
     bool                is_active;
     CeedEvalMode        eval_mode;
     CeedVector          vec;
-    CeedElemRestriction rstr;
+    CeedElemRestriction elem_rstr;
 
     // Restore evec
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
@@ -458,11 +463,11 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
     // Restrict
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     is_active = vec == CEED_VECTOR_ACTIVE;
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     if (is_active) vec = out_vec;
-    CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request));
     if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
-    CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
 
   // Restore input arrays
@@ -473,8 +478,8 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
 //------------------------------------------------------------------------------
 // Core code for assembling linear QFunction
 //------------------------------------------------------------------------------
-static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
-                                                               CeedRequest *request) {
+static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled,
+                                                               CeedElemRestriction *elem_rstr, CeedRequest *request) {
   Ceed                ceed, ceed_parent;
   CeedSize            q_size;
   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
@@ -555,7 +560,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
     CeedInt  strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, l_size, strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, l_size, strides, elem_rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
@@ -612,15 +617,16 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
 //------------------------------------------------------------------------------
 // Assemble Linear QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
-  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, rstr, request);
+static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *elem_rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, elem_rstr, request);
 }
 
 //------------------------------------------------------------------------------
 // Update Assembled Linear QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
-  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &rstr, request);
+static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction elem_rstr,
+                                                          CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &elem_rstr, request);
 }
 
 //------------------------------------------------------------------------------
@@ -892,22 +898,25 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
 // Assemble diagonal common code
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
-  Ceed                ceed;
-  Ceed_Sycl          *sycl_data;
-  CeedInt             num_elem;
-  CeedScalar         *elem_diag_array;
-  const CeedScalar   *assembled_qf_array;
-  CeedVector          assembled_qf = NULL;
-  CeedElemRestriction rstr         = NULL;
-  CeedOperator_Sycl  *impl;
+  Ceed               ceed;
+  Ceed_Sycl         *sycl_data;
+  CeedInt            num_elem;
+  CeedScalar        *elem_diag_array;
+  const CeedScalar  *assembled_qf_array;
+  CeedVector         assembled_qf = NULL;
+  CeedOperator_Sycl *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
 
   // Assemble QFunction
-  CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request));
-  CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
+  {
+    CeedElemRestriction elem_rstr = NULL;
+
+    CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &elem_rstr, request));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  }
 
   // Setup
   if (!impl->diag) {
diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp
index c435e65e4f..46123d5569 100644
--- a/backends/sycl-ref/ceed-sycl-ref.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref.hpp
@@ -106,7 +106,6 @@ typedef struct {
   CeedVector                *q_vecs_out;  // Output Q-vectors needed to apply operator
   CeedInt                    num_e_in;
   CeedInt                    num_e_out;
-  CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
   CeedVector                *qf_active_in;
   CeedOperatorDiag_Sycl     *diag;

From 85938a6d1dd5e68e6deadca612b182f7422c5a77 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 11 Sep 2024 17:26:41 +0000
Subject: [PATCH 200/571] sycl - Misc fixes

---
 backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp | 4 ++--
 backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index 650a52cf4d..0736446f4c 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -107,12 +107,12 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
           break;
         }
       }
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
       if (index == -1) {
         CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &impl->fields->outputs[i]));
       } else {
         impl->fields->outputs[i] = impl->fields->outputs[index];
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -189,10 +189,10 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
           break;
         }
       }
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
       if (index == -1) {
         CeedCallBackend(CeedVectorRestoreArray(vec, &impl->fields->outputs[i]));
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index a046690420..69761c340e 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -1023,9 +1023,9 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
       if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
       CeedCallBackend(CeedBasisDestroy(&basis));
-      CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
       if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));

From 6782e2f8092b6c5e48173c28c35c372c61b457a7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 17 Sep 2024 11:39:38 -0600
Subject: [PATCH 201/571] sycl - fix regresions

---
 .../ceed-sycl-gen-operator-build.sycl.cpp     |  8 +-
 .../sycl-ref/ceed-sycl-ref-operator.sycl.cpp  | 98 +++++++++++--------
 2 files changed, 60 insertions(+), 46 deletions(-)

diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index b14e155124..ee7aab812c 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -274,9 +274,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
 
     // Set field constants
     if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -334,9 +334,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
 
     // Set field constants
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
@@ -401,8 +401,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
 
     // Restriction
     if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
@@ -677,8 +677,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
     switch (eval_mode) {
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 69761c340e..e43981c217 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -89,10 +89,13 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) {
     CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_out, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_in, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_out, sycl_data->sycl_context));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
 
     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
+    CeedCallBackend(CeedBasisDestroy(&impl->diag->basis_in));
+    CeedCallBackend(CeedBasisDestroy(&impl->diag->basis_out));
   }
   CeedCallBackend(CeedFree(&impl->diag));
 
@@ -115,7 +118,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
   Ceed                ceed;
   CeedSize            q_size;
   bool                is_strided, skip_restriction;
-  CeedInt             dim, size;
+  CeedInt             size;
   CeedOperatorField  *op_fields;
   CeedQFunctionField *qf_fields;
 
@@ -133,7 +136,6 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
     CeedEvalMode        eval_mode;
     CeedVector          vec;
     CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
 
@@ -183,20 +185,21 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
       case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-        CeedCallBackend(CeedBasisDestroy(&basis));
         q_size = (CeedSize)num_elem * Q * size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
-      case CEED_EVAL_WEIGHT:  // Only on input fields
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis basis;
+
+        // Note: only on input fields
         q_size = (CeedSize)num_elem * Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
         CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
@@ -463,8 +466,8 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
     // Restrict
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     is_active = vec == CEED_VECTOR_ACTIVE;
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     if (is_active) vec = out_vec;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request));
     if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
@@ -637,6 +640,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   Ceed_Sycl          *sycl_data;
   CeedInt             num_input_fields, num_output_fields, num_eval_mode_in = 0, num_comp = 0, dim = 1, num_eval_mode_out = 0;
   CeedEvalMode       *eval_mode_in = NULL, *eval_mode_out = NULL;
+  CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
   CeedBasis           basis_in = NULL, basis_out = NULL;
   CeedQFunctionField *qf_fields;
   CeedQFunction       qf;
@@ -655,14 +659,19 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedEvalMode eval_mode;
-      CeedBasis    basis;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCheck(rstr_in == elem_rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly");
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-      CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
-                "Backend does not implement operator diagonal assembly with multiple active bases");
       if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCheck(basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases");
       CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       switch (eval_mode) {
         case CEED_EVAL_NONE:
@@ -684,6 +693,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
     }
     CeedCallBackend(CeedVectorDestroy(&vec));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
 
   // Determine active output basis
   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
@@ -693,26 +703,30 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedEvalMode eval_mode;
-      CeedBasis    basis;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCheck(rstr_out == elem_rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly");
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-      CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
-                "Backend does not implement operator diagonal assembly with multiple active bases");
       if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCheck(basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases");
       CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       switch (eval_mode) {
         case CEED_EVAL_NONE:
         case CEED_EVAL_INTERP:
-          CeedCallBackend(CeedRealloc(num_eval_mode_in + 1, &eval_mode_in));
-          eval_mode_in[num_eval_mode_in] = eval_mode;
-          num_eval_mode_in += 1;
+          CeedCallBackend(CeedRealloc(num_eval_mode_out + 1, &eval_mode_out));
+          eval_mode_out[num_eval_mode_out] = eval_mode;
+          num_eval_mode_out += 1;
           break;
         case CEED_EVAL_GRAD:
-          CeedCallBackend(CeedRealloc(num_eval_mode_in + dim, &eval_mode_in));
-          for (CeedInt d = 0; d < dim; d++) eval_mode_in[num_eval_mode_in + d] = eval_mode;
-          num_eval_mode_in += dim;
+          CeedCallBackend(CeedRealloc(num_eval_mode_out + dim, &eval_mode_out));
+          for (CeedInt d = 0; d < dim; d++) eval_mode_out[num_eval_mode_out + d] = eval_mode;
+          num_eval_mode_out += dim;
           break;
         case CEED_EVAL_WEIGHT:
         case CEED_EVAL_DIV:
@@ -729,8 +743,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedCalloc(1, &impl->diag));
   CeedOperatorDiag_Sycl *diag = impl->diag;
 
-  diag->basis_in          = basis_in;
-  diag->basis_out         = basis_out;
+  CeedCallBackend(CeedBasisReferenceCopy(basis_in, &diag->basis_in));
+  CeedCallBackend(CeedBasisReferenceCopy(basis_out, &diag->basis_out));
   diag->h_eval_mode_in    = eval_mode_in;
   diag->h_eval_mode_out   = eval_mode_out;
   diag->num_eval_mode_in  = num_eval_mode_in;
@@ -740,6 +754,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   CeedInt num_nodes, num_qpts;
   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
+  CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
   diag->num_nodes = num_nodes;
   diag->num_qpts  = num_qpts;
   diag->num_comp  = num_comp;
@@ -801,13 +816,12 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   copy_events.push_back(eval_mode_out_copy);
 
   // Restriction
-  {
-    CeedElemRestriction rstr_out;
+  CeedCallBackend(CeedElemRestrictionReferenceCopy(rstr_out, &diag->diag_rstr));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
 
-    CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, NULL, &rstr_out));
-    diag->diag_rstr = rstr_out;
-    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
-  }
+  // Cleanup
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
 
   // Wait for all copies to complete and handle exceptions
   CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
@@ -1020,16 +1034,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
       CeedElemRestriction elem_rstr;
       CeedBasis           basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
-      CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
-      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
       if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size));
+      CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCheck(basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       if (eval_mode != CEED_EVAL_NONE) {
         CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in));
@@ -1058,14 +1072,14 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
       CeedElemRestriction elem_rstr;
       CeedBasis           basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
-      CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
-                "Backend does not implement operator assembly with multiple active bases");
-      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
-      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr));
       if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCheck(rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly");
       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCheck(basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       if (eval_mode != CEED_EVAL_NONE) {
         CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out));

From 8a297abd0abffb6af7c8cbd4ffcd5e5f554c8ef0 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 10 Oct 2024 21:32:39 -0600
Subject: [PATCH 202/571] feat(op): Add CeedCompositeOperatorGetSubByName()

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>
---
 include/ceed/ceed.h       |  1 +
 interface/ceed-operator.c | 34 ++++++++++++++++++++++++++++++++++
 tests/t520-operator.c     | 12 ++++++++++++
 3 files changed, 47 insertions(+)

diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index d847525da8..1660d724a9 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -416,6 +416,7 @@ CEED_EXTERN int  CeedOperatorIsAtPoints(CeedOperator op, bool *is_at_points);
 CEED_EXTERN int  CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op);
 CEED_EXTERN int  CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators);
 CEED_EXTERN int  CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators);
+CEED_EXTERN int  CeedCompositeOperatorGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op);
 CEED_EXTERN int  CeedOperatorCheckReady(CeedOperator op);
 CEED_EXTERN int  CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
 CEED_EXTERN int  CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 5881846665..065c5a2cda 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1284,6 +1284,40 @@ int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operator
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get a sub `CeedOperator` of a composite `CeedOperator` from its name.
+
+  `sub_op` is set to `NULL` if the sub operator is not found.
+
+  Note: Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+
+  @param[in]  op      Composite `CeedOperator`
+  @param[in]  op_name Name of desired sub `CeedOperator`
+  @param[out] sub_op  Sub `CeedOperator` corresponding to the name
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedCompositeOperatorGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op) {
+  bool          is_composite;
+  CeedInt       num_sub_ops;
+  CeedOperator *sub_ops;
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator");
+  *sub_op = NULL;
+  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub_ops));
+  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_ops));
+  for (CeedInt i = 0; i < num_sub_ops; i++) {
+    if (sub_ops[i]->name && !strcmp(op_name, sub_ops[i]->name)) {
+      *sub_op = sub_ops[i];
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Check if a `CeedOperator` is ready to be used.
 
diff --git a/tests/t520-operator.c b/tests/t520-operator.c
index 9035234ffa..31fceb9c97 100644
--- a/tests/t520-operator.c
+++ b/tests/t520-operator.c
@@ -111,6 +111,7 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet);
   CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetName(op_mass_tet, "mass tet");
 
   // Set up Hex Elements
   // -- Restrictions
@@ -154,6 +155,7 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex);
   CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetName(op_mass_hex, "mass hex");
 
   // Set up Composite Operators
   // -- Create
@@ -168,6 +170,16 @@ int main(int argc, char **argv) {
   CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
   CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
 
+  {  // Test CeedCompositeOperatorGetSubByName
+    CeedOperator op_byname;
+
+    CeedCompositeOperatorGetSubByName(op_mass, "mass hex", &op_byname);
+    if (op_byname != op_mass_hex) printf("CeedCompositeOperatorGetSubByName returned incorrect Sub Operator");
+
+    CeedCompositeOperatorGetSubByName(op_mass, "asdf", &op_byname);
+    if (op_byname != NULL) printf("CeedCompositeOperatorGetSubByName returned non-NULL for non-existent Sub Operator");
+  }
+
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
 

From 69d19baced7c427ab4da8cdea7fc4ab45fdaa0db Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 11 Oct 2024 14:54:29 -0600
Subject: [PATCH 203/571] cpu - fix minor leak

---
 backends/ref/ceed-ref-operator.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 4714387744..b53aa14e80 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1456,7 +1456,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           // ---- Check if elem size matches
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
-          if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
+          if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+            continue;
+          }
           if (rstr_type == CEED_RESTRICTION_POINTS) {
             CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, e, &elem_size));
           } else {
@@ -1466,7 +1469,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
             CeedInt num_comp = 0;
 
             CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-            if (e_vec_size != num_comp * elem_size) continue;
+            if (e_vec_size != num_comp * elem_size) {
+              CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+              continue;
+            }
           }
 
           // ---- Basis action

From bbac207aa45994afed01cc8818c3d512f8fcfdb1 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Mon, 14 Oct 2024 15:54:49 -0600
Subject: [PATCH 204/571] Fix memory leak

---
 backends/ref/ceed-ref-operator.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index b53aa14e80..4c62608d49 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -980,7 +980,11 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
       }
     }
     // Restrict output block
-    if (skip_rstr[i]) continue;
+    if (skip_rstr[i]) {
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      continue;
+    }
+
     // Get output vector
     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));

From daaf13a462f999a7d367f3df68e0e3c34270722c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 16 Oct 2024 12:20:23 -0600
Subject: [PATCH 205/571] jit - allow <> includes other than math, std*

---
 backends/cuda/ceed-cuda-compile.cpp |  3 ++-
 interface/ceed-jit-tools.c          | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 9bd433fd01..fd2cdfd713 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -38,7 +38,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   size_t                ptx_size;
   char                 *ptx;
   const char           *jit_defs_path, *jit_defs_source;
-  const int             num_opts = 3;
+  const int             num_opts = 4;
   const char           *opts[num_opts];
   nvrtcProgram          prog;
   struct cudaDeviceProp prop;
@@ -93,6 +93,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
       + std::to_string(prop.major) + std::to_string(prop.minor);
   opts[1] = arch_arg.c_str();
   opts[2] = "-Dint32_t=int";
+  opts[3] = "-I/home/jeremy/Dev/libCEED/include/ceed/jit-source/"
 
   // Add string source argument provided in call
   code << source;
diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index a45054cfc9..72b95f766e 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -216,6 +216,9 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
       bool  is_ceed_header    = next_left_chevron && (next_new_line - next_left_chevron > 0) &&
                             (!strncmp(next_left_chevron, "<ceed/jit-source/", 17) || !strncmp(next_left_chevron, "<ceed/types.h>", 14) ||
                              !strncmp(next_left_chevron, "<ceed/ceed-f32.h>", 17) || !strncmp(next_left_chevron, "<ceed/ceed-f64.h>", 17));
+      bool is_std_header =
+          next_left_chevron && (next_new_line - next_left_chevron > 0) &&
+          (!strncmp(next_left_chevron, "<std", 4) || !strncmp(next_left_chevron, "<math.h>", 8) || !strncmp(next_left_chevron, "<ceed", 5));
 
       if (is_local_header || is_ceed_header) {
         // ---- Build source path
@@ -254,6 +257,13 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
         }
         CeedCall(CeedFree(&include_source_path));
         CeedCall(CeedFree(&normalized_include_source_path));
+      } else if (!is_std_header) {
+        long header_copy_size = next_new_line - first_hash + 1;
+
+        CeedCall(CeedRealloc(current_size + copy_size + header_copy_size + 2, buffer));
+        memcpy(&(*buffer)[current_size + copy_size], "\n", 2);
+        memcpy(&(*buffer)[current_size + copy_size + 1], first_hash, header_copy_size);
+        memcpy(&(*buffer)[current_size + copy_size + header_copy_size], "", 1);
       }
       file_offset = strchr(first_hash, '\n') - temp_buffer + 1;
     }

From b13efd58b277efef1db70d6f06eaaf4d415a7642 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 16 Oct 2024 14:47:41 -0600
Subject: [PATCH 206/571] jit - I include JiT source dirs set

---
 backends/cuda/ceed-cuda-compile.cpp | 26 +++++++++++++---
 backends/hip/ceed-hip-compile.cpp   | 25 +++++++++++++--
 include/ceed/backend.h              |  2 ++
 interface/ceed-jit-tools.c          | 19 +++++++-----
 interface/ceed.c                    | 47 +++++++++++++++++++++++++++++
 tests/t406-qfunction.h              |  2 +-
 6 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index fd2cdfd713..7f7a4a9dde 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -38,8 +38,9 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   size_t                ptx_size;
   char                 *ptx;
   const char           *jit_defs_path, *jit_defs_source;
-  const int             num_opts = 4;
-  const char           *opts[num_opts];
+  const int             num_opts            = 3;
+  CeedInt               num_jit_source_dirs = 0;
+  const char          **opts;
   nvrtcProgram          prog;
   struct cudaDeviceProp prop;
   Ceed_Cuda            *ceed_data;
@@ -77,6 +78,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   CeedCallBackend(CeedFree(&jit_defs_source));
 
   // Non-macro options
+  CeedCallBackend(CeedCalloc(num_opts, &opts));
   opts[0] = "-default-device";
   CeedCallBackend(CeedGetData(ceed, &ceed_data));
   CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
@@ -93,7 +95,19 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
       + std::to_string(prop.major) + std::to_string(prop.minor);
   opts[1] = arch_arg.c_str();
   opts[2] = "-Dint32_t=int";
-  opts[3] = "-I/home/jeremy/Dev/libCEED/include/ceed/jit-source/"
+  {
+    const char **jit_source_dirs;
+
+    CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
+    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+      std::ostringstream include_dirs_arg;
+
+      include_dirs_arg << "-I" << jit_source_dirs[i];
+      CeedCallBackend(CeedStringAllocCopy(include_dirs_arg.str().c_str(), (char **)&opts[num_opts + i]));
+    }
+    CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+  }
 
   // Add string source argument provided in call
   code << source;
@@ -102,8 +116,12 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
-  nvrtcResult result = nvrtcCompileProgram(prog, num_opts, opts);
+  nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs, opts);
 
+  for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+    CeedCallBackend(CeedFree(&opts[num_opts + i]));
+  }
+  CeedCallBackend(CeedFree(&opts));
   if (result != NVRTC_SUCCESS) {
     char  *log;
     size_t log_size;
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index cafb79ed7f..d86b01dd1b 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -37,8 +37,9 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   size_t                 ptx_size;
   char                  *jit_defs_source, *ptx;
   const char            *jit_defs_path;
-  const int              num_opts = 3;
-  const char            *opts[num_opts];
+  const int              num_opts            = 3;
+  CeedInt                num_jit_source_dirs = 0;
+  const char           **opts;
   int                    runtime_version;
   hiprtcProgram          prog;
   struct hipDeviceProp_t prop;
@@ -84,12 +85,26 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   CeedCallBackend(CeedFree(&jit_defs_source));
 
   // Non-macro options
+  CeedCallBackend(CeedCalloc(num_opts, &opts));
   opts[0] = "-default-device";
   CeedCallBackend(CeedGetData(ceed, (void **)&ceed_data));
   CeedCallHip(ceed, hipGetDeviceProperties(&prop, ceed_data->device_id));
   std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName);
   opts[1]              = arch_arg.c_str();
   opts[2]              = "-munsafe-fp-atomics";
+  {
+    const char **jit_source_dirs;
+
+    CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
+    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+      std::ostringstream include_dirs_arg;
+
+      include_dirs_arg << "-I" << jit_source_dirs[i];
+      CeedCallBackend(CeedStringAllocCopy(include_dirs_arg.str().c_str(), (char **)&opts[num_opts + i]));
+    }
+    CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+  }
 
   // Add string source argument provided in call
   code << source;
@@ -98,8 +113,12 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
-  hiprtcResult result = hiprtcCompileProgram(prog, num_opts, opts);
+  hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs, opts);
 
+  for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+    CeedCallBackend(CeedFree(&opts[num_opts + i]));
+  }
+  CeedCallBackend(CeedFree(&opts));
   if (result != HIPRTC_SUCCESS) {
     size_t log_size;
     char  *log;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 05da6f8981..43f2d52d20 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -254,6 +254,8 @@ CEED_EXTERN int CeedSetData(Ceed ceed, void *data);
 CEED_EXTERN int CeedReference(Ceed ceed);
 CEED_EXTERN int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec);
 CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec);
+CEED_EXTERN int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots);
+CEED_EXTERN int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots);
 
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
 CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index 72b95f766e..d89f713dfe 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -400,28 +400,33 @@ int CeedGetJitRelativePath(const char *absolute_file_path, const char **relative
   @ref Backend
 **/
 int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, const char **absolute_file_path) {
-  Ceed ceed_parent;
+  const char **jit_source_dirs;
+  CeedInt      num_source_dirs;
 
   // Debug
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed JiT ----------\n");
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Relative JiT source file: ");
   CeedDebug(ceed, "%s\n", relative_file_path);
 
-  CeedCall(CeedGetParent(ceed, &ceed_parent));
-  for (CeedInt i = 0; i < ceed_parent->num_jit_source_roots; i++) {
+  CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_source_dirs, &jit_source_dirs));
+  for (CeedInt i = 0; i < num_source_dirs; i++) {
     bool is_valid;
 
     // Debug
     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Checking JiT root: ");
-    CeedDebug(ceed, "%s\n", ceed_parent->jit_source_roots[i]);
+    CeedDebug(ceed, "%s\n", jit_source_dirs[i]);
 
     // Build and check absolute path with current root
-    CeedCall(CeedPathConcatenate(ceed, ceed_parent->jit_source_roots[i], relative_file_path, (char **)absolute_file_path));
+    CeedCall(CeedPathConcatenate(ceed, jit_source_dirs[i], relative_file_path, (char **)absolute_file_path));
     CeedCall(CeedCheckFilePath(ceed, *absolute_file_path, &is_valid));
 
-    if (is_valid) return CEED_ERROR_SUCCESS;
+    if (is_valid) {
+      CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+      return CEED_ERROR_SUCCESS;
+    }
     // LCOV_EXCL_START
-    else CeedCall(CeedFree(absolute_file_path));
+    else
+      CeedCall(CeedFree(absolute_file_path));
     // LCOV_EXCL_STOP
   }
   // LCOV_EXCL_START
diff --git a/interface/ceed.c b/interface/ceed.c
index 1becb3de14..e214b6eb85 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -658,6 +658,16 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
     fallback_ceed->op_fallback_parent = ceed;
     fallback_ceed->Error              = ceed->Error;
     ceed->op_fallback_ceed            = fallback_ceed;
+    {
+      const char **jit_source_dirs;
+      CeedInt      num_jit_source_dirs = 0;
+
+      CeedCall(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
+      for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+        CeedCall(CeedAddJitSourceRoot(fallback_ceed, jit_source_dirs[i]));
+      }
+      CeedCall(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+    }
   }
   *fallback_ceed = ceed->op_fallback_ceed;
   return CEED_ERROR_SUCCESS;
@@ -863,6 +873,43 @@ int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) {
   // LCOV_EXCL_STOP
 }
 
+/**
+  @brief Retrieve list ofadditional JiT source roots from `Ceed` context.
+
+  Note: The caller is responsible for restoring `jit_source_roots` with @ref CeedRestoreJitSourceRoots().
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] num_source_roots Number of JiT source directories
+  @param[out] jit_source_roots Absolute paths to additional JiT source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *num_source_roots = ceed_parent->num_jit_source_roots;
+  *jit_source_roots = (const char **)ceed_parent->jit_source_roots;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Restore list of additional JiT source roots from with @ref CeedGetJitSourceRoots()
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] jit_source_roots Absolute paths to additional JiT source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots) {
+  *jit_source_roots = NULL;
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h
index f4782f7029..8be7fd8d34 100644
--- a/tests/t406-qfunction.h
+++ b/tests/t406-qfunction.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-// Note: intentionally testing strange spacing in '#include's
+// Note: intentionally testing strange spacing in include's
 // clang-format off
 #include <ceed.h>
 #  include  <math.h>

From 255dad3207f061d22701e91ddb8337d8c6809493 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 16 Oct 2024 16:12:15 -0600
Subject: [PATCH 207/571] test - compile QF with extra include dir

---
 Makefile                              |  2 +-
 tests/t406-qfunction-helper.h         |  7 +++++--
 tests/t406-qfunction.c                |  7 +++++++
 tests/test-include/fake-sys-include.h | 10 ++++++++++
 4 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 tests/test-include/fake-sys-include.h

diff --git a/Makefile b/Makefile
index cb25f658b5..023d1ed559 100644
--- a/Makefile
+++ b/Makefile
@@ -572,7 +572,7 @@ $(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR
 	$(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<)
 
 $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.c | $$(@D)/.DIR
-	$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
+	$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS) -I./tests/test-include
 
 $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.f90 | $$(@D)/.DIR
 	$(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $(<D))/"' $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
diff --git a/tests/t406-qfunction-helper.h b/tests/t406-qfunction-helper.h
index 9db4901023..4c41de3a33 100644
--- a/tests/t406-qfunction-helper.h
+++ b/tests/t406-qfunction-helper.h
@@ -15,6 +15,9 @@
 // Test include path with "/./"
 #include "./t406-qfunction-scales.h"
 
-CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { return SCALE_TWO * x; }
+// Test include via -I....
+#include <fake-sys-include.h>
 
-CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { return SCALE_THREE * x; }
+CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { return FAKE_SYS_SCALE_ONE * SCALE_TWO * x; }
+
+CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { return FAKE_SYS_SCALE_ONE * SCALE_THREE * x; }
diff --git a/tests/t406-qfunction.c b/tests/t406-qfunction.c
index 201c3782a4..e558139146 100644
--- a/tests/t406-qfunction.c
+++ b/tests/t406-qfunction.c
@@ -18,6 +18,13 @@ int main(int argc, char **argv) {
   CeedScalar    v_true[q];
 
   CeedInit(argv[1], &ceed);
+  {
+    char  file_path[2056] = __FILE__;
+    char *last_slash      = strrchr(file_path, '/');
+
+    memcpy(&file_path[last_slash - file_path], "/test-include/", 15);
+    CeedAddJitSourceRoot(ceed, file_path);
+  }
 
   CeedVectorCreate(ceed, q, &w);
   CeedVectorCreate(ceed, q, &u);
diff --git a/tests/test-include/fake-sys-include.h b/tests/test-include/fake-sys-include.h
new file mode 100644
index 0000000000..c6069d006a
--- /dev/null
+++ b/tests/test-include/fake-sys-include.h
@@ -0,0 +1,10 @@
+#define FAKE_SYS_SCALE_ONE 1
+
+// Note - files included this way cannot transitively include any files CUDA/ROCm won't compile
+// These are bad
+// #include <ceed.h>
+// #include <math.h>
+// #include <stddef.h>
+
+// This is ok
+#include <ceed/types.h>

From a491a57ee1186d4a57a443cd84e0df1f56768bf7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 16 Oct 2024 17:08:42 -0600
Subject: [PATCH 208/571] jit - redirect ceed.h to ceed/types.h over
 ceed/ceed.h for jit

---
 backends/cuda/ceed-cuda-compile.cpp   | 3 ++-
 backends/hip/ceed-hip-compile.cpp     | 3 ++-
 include/ceed.h                        | 4 ++++
 include/ceed/types.h                  | 2 ++
 tests/test-include/fake-sys-include.h | 4 ++--
 5 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 7f7a4a9dde..4002671e31 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -38,7 +38,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   size_t                ptx_size;
   char                 *ptx;
   const char           *jit_defs_path, *jit_defs_source;
-  const int             num_opts            = 3;
+  const int             num_opts            = 4;
   CeedInt               num_jit_source_dirs = 0;
   const char          **opts;
   nvrtcProgram          prog;
@@ -95,6 +95,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
       + std::to_string(prop.major) + std::to_string(prop.minor);
   opts[1] = arch_arg.c_str();
   opts[2] = "-Dint32_t=int";
+  opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
   {
     const char **jit_source_dirs;
 
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index d86b01dd1b..7e105f9fdb 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -37,7 +37,7 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   size_t                 ptx_size;
   char                  *jit_defs_source, *ptx;
   const char            *jit_defs_path;
-  const int              num_opts            = 3;
+  const int              num_opts            = 4;
   CeedInt                num_jit_source_dirs = 0;
   const char           **opts;
   int                    runtime_version;
@@ -92,6 +92,7 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName);
   opts[1]              = arch_arg.c_str();
   opts[2]              = "-munsafe-fp-atomics";
+  opts[3]              = "-DCEED_RUNNING_JIT_PASS=1";
   {
     const char **jit_source_dirs;
 
diff --git a/include/ceed.h b/include/ceed.h
index effe28eaf1..b905b30851 100644
--- a/include/ceed.h
+++ b/include/ceed.h
@@ -1 +1,5 @@
+#ifdef CEED_RUNNING_JIT_PASS
+#include "ceed/types.h"
+#else
 #include "ceed/ceed.h"
+#endif
diff --git a/include/ceed/types.h b/include/ceed/types.h
index 6817a73322..6c6d126548 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -10,8 +10,10 @@
 #ifndef CEED_QFUNCTION_DEFS_H
 #define CEED_QFUNCTION_DEFS_H
 
+#ifndef CEED_RUNNING_JIT_PASS
 #include <stddef.h>
 #include <stdint.h>
+#endif
 
 /**
   @ingroup CeedQFunction
diff --git a/tests/test-include/fake-sys-include.h b/tests/test-include/fake-sys-include.h
index c6069d006a..ec1c9b2c56 100644
--- a/tests/test-include/fake-sys-include.h
+++ b/tests/test-include/fake-sys-include.h
@@ -2,9 +2,9 @@
 
 // Note - files included this way cannot transitively include any files CUDA/ROCm won't compile
 // These are bad
-// #include <ceed.h>
 // #include <math.h>
 // #include <stddef.h>
 
-// This is ok
+// These are ok
+#include <ceed.h>
 #include <ceed/types.h>

From 91adc9c87122b76e31f051252d6dcc3e7f5ce6f8 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 16 Oct 2024 17:14:13 -0600
Subject: [PATCH 209/571] cuda/hip - include *-jit.h via #include

---
 backends/cuda/ceed-cuda-compile.cpp | 13 +------------
 backends/hip/ceed-hip-compile.cpp   | 10 ++--------
 2 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 4002671e31..6d3faf4cd1 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -37,7 +37,6 @@
 int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
   size_t                ptx_size;
   char                 *ptx;
-  const char           *jit_defs_path, *jit_defs_source;
   const int             num_opts            = 4;
   CeedInt               num_jit_source_dirs = 0;
   const char          **opts;
@@ -65,17 +64,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   }
 
   // Standard libCEED definitions for CUDA backends
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-jit.h", &jit_defs_path));
-  {
-    char *source;
-
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &source));
-    jit_defs_source = source;
-  }
-  code << jit_defs_source;
-  code << "\n\n";
-  CeedCallBackend(CeedFree(&jit_defs_path));
-  CeedCallBackend(CeedFree(&jit_defs_source));
+  code << "#include <ceed/jit-source/cuda/cuda-jit.h>\n\n";
 
   // Non-macro options
   CeedCallBackend(CeedCalloc(num_opts, &opts));
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 7e105f9fdb..582c0a2e24 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -35,8 +35,7 @@
 //------------------------------------------------------------------------------
 int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) {
   size_t                 ptx_size;
-  char                  *jit_defs_source, *ptx;
-  const char            *jit_defs_path;
+  char                  *ptx;
   const int              num_opts            = 4;
   CeedInt                num_jit_source_dirs = 0;
   const char           **opts;
@@ -77,12 +76,7 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   }
 
   // Standard libCEED definitions for HIP backends
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-jit.h", &jit_defs_path));
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source));
-  code << jit_defs_source;
-  code << "\n\n";
-  CeedCallBackend(CeedFree(&jit_defs_path));
-  CeedCallBackend(CeedFree(&jit_defs_source));
+  code << "#include <ceed/jit-source/hip/hip-jit.h>\n\n";
 
   // Non-macro options
   CeedCallBackend(CeedCalloc(num_opts, &opts));

From c0b5abf0f23b15c4f0ada76f8abe9f8d2b6fa247 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 17 Oct 2024 09:04:13 -0600
Subject: [PATCH 210/571] qf - prefer ceed/types.h over ceed.h in qf source

---
 examples/ceed/ex1-volume.h                             |  2 +-
 examples/ceed/ex2-surface.h                            |  2 +-
 examples/deal.II/bps-qfunctions.h                      |  2 +-
 examples/fluids/qfunctions/advection.h                 |  5 ++++-
 examples/fluids/qfunctions/advection_types.h           |  6 +++++-
 examples/fluids/qfunctions/bc_freestream.h             |  4 ++++
 examples/fluids/qfunctions/blasius.h                   |  5 ++++-
 examples/fluids/qfunctions/channel.h                   |  5 ++++-
 examples/fluids/qfunctions/densitycurrent.h            |  4 +++-
 examples/fluids/qfunctions/differential_filter.h       |  5 ++++-
 examples/fluids/qfunctions/eulervortex.h               |  5 ++++-
 examples/fluids/qfunctions/gaussianwave.h              |  4 +++-
 examples/fluids/qfunctions/grid_anisotropy_tensor.h    |  2 +-
 examples/fluids/qfunctions/inverse_multiplicity.h      |  2 +-
 examples/fluids/qfunctions/mass.h                      |  4 +++-
 examples/fluids/qfunctions/newtonian.h                 |  4 +++-
 examples/fluids/qfunctions/newtonian_state.h           |  4 +++-
 examples/fluids/qfunctions/newtonian_types.h           |  5 ++++-
 examples/fluids/qfunctions/setupgeo.h                  |  4 +++-
 examples/fluids/qfunctions/setupgeo2d.h                |  3 ++-
 examples/fluids/qfunctions/setupgeo_helpers.h          |  4 +++-
 examples/fluids/qfunctions/shocktube.h                 |  5 ++++-
 examples/fluids/qfunctions/stabilization.h             |  2 +-
 examples/fluids/qfunctions/stg_shur14.h                |  4 +++-
 examples/fluids/qfunctions/stg_shur14_type.h           |  5 ++++-
 .../fluids/qfunctions/strong_boundary_conditions.h     |  2 +-
 examples/fluids/qfunctions/taylorgreen.h               |  4 +++-
 examples/fluids/qfunctions/turb_spanstats.h            |  2 +-
 examples/fluids/qfunctions/utils.h                     |  4 +++-
 examples/fluids/qfunctions/utils_eigensolver_jacobi.h  |  5 ++++-
 .../fluids/qfunctions/velocity_gradient_projection.h   |  2 +-
 examples/mfem/bp1.h                                    |  2 +-
 examples/mfem/bp3.h                                    |  2 +-
 examples/nek/bps/bps.h                                 | 10 ++++------
 examples/petsc/qfunctions/area/areacube.h              |  4 +++-
 examples/petsc/qfunctions/area/areasphere.h            |  4 +++-
 examples/petsc/qfunctions/bps/bp1.h                    |  4 +++-
 examples/petsc/qfunctions/bps/bp1sphere.h              |  4 +++-
 examples/petsc/qfunctions/bps/bp2.h                    |  4 +++-
 examples/petsc/qfunctions/bps/bp2sphere.h              |  4 +++-
 examples/petsc/qfunctions/bps/bp3.h                    |  4 +++-
 examples/petsc/qfunctions/bps/bp3sphere.h              |  4 +++-
 examples/petsc/qfunctions/bps/bp4.h                    |  4 +++-
 examples/petsc/qfunctions/bps/bp4sphere.h              |  4 +++-
 examples/petsc/qfunctions/bps/common.h                 |  2 +-
 examples/petsc/qfunctions/swarm/swarmmass.h            |  2 +-
 examples/solids/qfunctions/common.h                    |  2 +-
 examples/solids/qfunctions/constant-force.h            |  4 +++-
 .../solids/qfunctions/finite-strain-mooney-rivlin.h    |  4 +++-
 examples/solids/qfunctions/finite-strain-neo-hookean.h |  4 +++-
 examples/solids/qfunctions/linear.h                    |  4 +++-
 examples/solids/qfunctions/manufactured-force.h        |  4 +++-
 examples/solids/qfunctions/manufactured-true.h         |  4 +++-
 examples/solids/qfunctions/traction-boundary.h         |  2 +-
 .../ceed/jit-source/cuda/cuda-atomic-add-fallback.h    |  2 +-
 include/ceed/jit-source/cuda/cuda-gen-templates.h      |  2 +-
 .../cuda/cuda-ref-basis-nontensor-templates.h          |  2 +-
 .../ceed/jit-source/cuda/cuda-ref-basis-nontensor.h    |  2 +-
 .../jit-source/cuda/cuda-ref-basis-tensor-at-points.h  |  2 +-
 include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h   |  2 +-
 .../cuda/cuda-ref-operator-assemble-diagonal.h         |  2 +-
 .../ceed/jit-source/cuda/cuda-ref-operator-assemble.h  |  2 +-
 include/ceed/jit-source/cuda/cuda-ref-qfunction.h      |  2 +-
 .../jit-source/cuda/cuda-ref-restriction-at-points.h   |  2 +-
 .../cuda/cuda-ref-restriction-curl-oriented.h          |  2 +-
 .../ceed/jit-source/cuda/cuda-ref-restriction-offset.h |  2 +-
 .../jit-source/cuda/cuda-ref-restriction-oriented.h    |  2 +-
 .../jit-source/cuda/cuda-ref-restriction-strided.h     |  2 +-
 .../cuda/cuda-shared-basis-read-write-templates.h      |  2 +-
 .../cuda/cuda-shared-basis-tensor-templates.h          |  2 +-
 .../ceed/jit-source/cuda/cuda-shared-basis-tensor.h    |  2 +-
 include/ceed/jit-source/gallery/ceed-identity.h        |  2 +-
 include/ceed/jit-source/gallery/ceed-mass1dbuild.h     |  2 +-
 include/ceed/jit-source/gallery/ceed-mass2dbuild.h     |  2 +-
 include/ceed/jit-source/gallery/ceed-mass3dbuild.h     |  2 +-
 include/ceed/jit-source/gallery/ceed-massapply.h       |  2 +-
 include/ceed/jit-source/gallery/ceed-poisson1dapply.h  |  2 +-
 include/ceed/jit-source/gallery/ceed-poisson1dbuild.h  |  2 +-
 include/ceed/jit-source/gallery/ceed-poisson2dapply.h  |  2 +-
 include/ceed/jit-source/gallery/ceed-poisson2dbuild.h  |  2 +-
 include/ceed/jit-source/gallery/ceed-poisson3dapply.h  |  2 +-
 include/ceed/jit-source/gallery/ceed-poisson3dbuild.h  |  2 +-
 include/ceed/jit-source/gallery/ceed-scale.h           |  2 +-
 include/ceed/jit-source/gallery/ceed-vectormassapply.h |  2 +-
 .../jit-source/gallery/ceed-vectorpoisson1dapply.h     |  2 +-
 .../jit-source/gallery/ceed-vectorpoisson2dapply.h     |  2 +-
 .../jit-source/gallery/ceed-vectorpoisson3dapply.h     |  2 +-
 include/ceed/jit-source/hip/hip-gen-templates.h        |  2 +-
 .../jit-source/hip/hip-ref-basis-nontensor-templates.h |  2 +-
 include/ceed/jit-source/hip/hip-ref-basis-nontensor.h  |  2 +-
 .../jit-source/hip/hip-ref-basis-tensor-at-points.h    |  2 +-
 include/ceed/jit-source/hip/hip-ref-basis-tensor.h     |  2 +-
 .../hip/hip-ref-operator-assemble-diagonal.h           |  2 +-
 .../ceed/jit-source/hip/hip-ref-operator-assemble.h    |  2 +-
 include/ceed/jit-source/hip/hip-ref-qfunction.h        |  2 +-
 .../jit-source/hip/hip-ref-restriction-at-points.h     |  2 +-
 .../jit-source/hip/hip-ref-restriction-curl-oriented.h |  2 +-
 .../ceed/jit-source/hip/hip-ref-restriction-offset.h   |  2 +-
 .../ceed/jit-source/hip/hip-ref-restriction-oriented.h |  2 +-
 .../ceed/jit-source/hip/hip-ref-restriction-strided.h  |  2 +-
 .../hip/hip-shared-basis-read-write-templates.h        |  2 +-
 .../jit-source/hip/hip-shared-basis-tensor-templates.h |  2 +-
 include/ceed/jit-source/hip/hip-shared-basis-tensor.h  |  2 +-
 include/ceed/jit-source/sycl/sycl-gen-templates.h      |  2 +-
 include/ceed/jit-source/sycl/sycl-ref-qfunction.h      |  2 +-
 .../sycl/sycl-shared-basis-read-write-templates.h      |  2 +-
 .../sycl/sycl-shared-basis-tensor-templates.h          |  2 +-
 .../ceed/jit-source/sycl/sycl-shared-basis-tensor.h    |  2 +-
 interface/ceed-qfunction.c                             |  2 ++
 tests/t400-qfunction.h                                 |  2 +-
 tests/t401-qfunction.h                                 |  2 +-
 tests/t405-qfunction.h                                 |  2 +-
 tests/t406-qfunction-helper.h                          |  1 +
 tests/t406-qfunction.h                                 |  6 +++++-
 tests/t409-qfunction.h                                 |  2 +-
 tests/t500-operator.h                                  |  2 +-
 tests/t502-operator.h                                  |  2 +-
 tests/t507-operator.h                                  |  2 +-
 tests/t510-operator.h                                  |  2 +-
 tests/t522-operator.h                                  |  2 +-
 tests/t530-operator.h                                  |  2 +-
 tests/t531-operator.h                                  |  2 +-
 tests/t532-operator.h                                  |  2 +-
 tests/t534-operator.h                                  |  2 +-
 tests/t535-operator.h                                  |  2 +-
 tests/t537-operator.h                                  |  2 +-
 tests/t539-operator.h                                  |  2 +-
 tests/t540-operator.h                                  |  2 +-
 tests/t541-operator.h                                  |  2 +-
 tests/t566-operator.h                                  |  2 +-
 tests/t567-operator.h                                  |  2 +-
 tests/t568-operator.h                                  |  2 +-
 tests/t580-operator.h                                  |  2 +-
 tests/t590-operator.h                                  |  2 +-
 tests/t591-operator.h                                  |  2 +-
 tests/test-include/fake-sys-include.h                  | 10 +++++++---
 136 files changed, 237 insertions(+), 140 deletions(-)

diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h
index d78ea16c6f..3ec78c4366 100644
--- a/examples/ceed/ex1-volume.h
+++ b/examples/ceed/ex1-volume.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_mass
 struct BuildContext {
diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h
index 4258a1e944..1355918b70 100644
--- a/examples/ceed/ex2-surface.h
+++ b/examples/ceed/ex2-surface.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_diff
 struct BuildContext {
diff --git a/examples/deal.II/bps-qfunctions.h b/examples/deal.II/bps-qfunctions.h
index 6161fdf840..b6a0c498c7 100644
--- a/examples/deal.II/bps-qfunctions.h
+++ b/examples/deal.II/bps-qfunctions.h
@@ -15,7 +15,7 @@
 //
 // ---------------------------------------------------------------------
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 
diff --git a/examples/fluids/qfunctions/advection.h b/examples/fluids/qfunctions/advection.h
index 43b5293837..41f6b249e7 100644
--- a/examples/fluids/qfunctions/advection.h
+++ b/examples/fluids/qfunctions/advection.h
@@ -7,8 +7,11 @@
 
 /// @file
 /// Advection initial condition and operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "advection_types.h"
 #include "newtonian_state.h"
diff --git a/examples/fluids/qfunctions/advection_types.h b/examples/fluids/qfunctions/advection_types.h
index 838995191c..daaee10bf7 100644
--- a/examples/fluids/qfunctions/advection_types.h
+++ b/examples/fluids/qfunctions/advection_types.h
@@ -6,7 +6,11 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
+
 #include "stabilization_types.h"
 
 typedef enum {
diff --git a/examples/fluids/qfunctions/bc_freestream.h b/examples/fluids/qfunctions/bc_freestream.h
index 90700496e0..b6c0aa33cf 100644
--- a/examples/fluids/qfunctions/bc_freestream.h
+++ b/examples/fluids/qfunctions/bc_freestream.h
@@ -7,6 +7,10 @@
 
 /// @file
 /// QFunctions for the `bc_freestream` and `bc_outflow` boundary conditions
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
+
 #include "bc_freestream_type.h"
 #include "newtonian_state.h"
 #include "newtonian_types.h"
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index 738af58898..e372aeedfb 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -7,7 +7,10 @@
 
 /// @file
 /// Operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
diff --git a/examples/fluids/qfunctions/channel.h b/examples/fluids/qfunctions/channel.h
index 21db7c8dd6..9d458b0f31 100644
--- a/examples/fluids/qfunctions/channel.h
+++ b/examples/fluids/qfunctions/channel.h
@@ -7,8 +7,11 @@
 
 /// @file
 /// Operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
diff --git a/examples/fluids/qfunctions/densitycurrent.h b/examples/fluids/qfunctions/densitycurrent.h
index c5fe3752a0..d1e61a0a10 100644
--- a/examples/fluids/qfunctions/densitycurrent.h
+++ b/examples/fluids/qfunctions/densitycurrent.h
@@ -11,8 +11,10 @@
 // Model from:
 //   Semi-Implicit Formulations of the Navier-Stokes Equations: Application to
 //   Nonhydrostatic Atmospheric Modeling, Giraldo, Restelli, and Lauter (2010).
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
diff --git a/examples/fluids/qfunctions/differential_filter.h b/examples/fluids/qfunctions/differential_filter.h
index 10b89b70c7..36b4cfa2a5 100644
--- a/examples/fluids/qfunctions/differential_filter.h
+++ b/examples/fluids/qfunctions/differential_filter.h
@@ -7,7 +7,10 @@
 //
 /// @file
 /// Implementation of differential filtering
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "differential_filter_enums.h"
 #include "newtonian_state.h"
diff --git a/examples/fluids/qfunctions/eulervortex.h b/examples/fluids/qfunctions/eulervortex.h
index 308cb50cea..878c5f615c 100644
--- a/examples/fluids/qfunctions/eulervortex.h
+++ b/examples/fluids/qfunctions/eulervortex.h
@@ -11,8 +11,11 @@
 
 // Model from:
 //   On the Order of Accuracy and Numerical Performance of Two Classes of Finite Volume WENO Schemes, Zhang, Zhang, and Shu (2011).
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/gaussianwave.h b/examples/fluids/qfunctions/gaussianwave.h
index 88f9feb126..f48de3bcf2 100644
--- a/examples/fluids/qfunctions/gaussianwave.h
+++ b/examples/fluids/qfunctions/gaussianwave.h
@@ -7,8 +7,10 @@
 
 /// @file
 /// Thermodynamic wave propogation for testing freestream/non-reflecting boundary conditions. Proposed in Mengaldo et. al. 2014
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_state.h"
 #include "utils.h"
diff --git a/examples/fluids/qfunctions/grid_anisotropy_tensor.h b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
index ef59a54c6d..cea712726f 100644
--- a/examples/fluids/qfunctions/grid_anisotropy_tensor.h
+++ b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Element anisotropy tensor, as defined in 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation'
 /// Prakash et al. 2022
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "utils.h"
 #include "utils_eigensolver_jacobi.h"
diff --git a/examples/fluids/qfunctions/inverse_multiplicity.h b/examples/fluids/qfunctions/inverse_multiplicity.h
index c51fc0586b..6f83c7b39c 100644
--- a/examples/fluids/qfunctions/inverse_multiplicity.h
+++ b/examples/fluids/qfunctions/inverse_multiplicity.h
@@ -4,7 +4,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 // @brief Calculate the inverse of the multiplicity, reducing to a single component
 CEED_QFUNCTION(InverseMultiplicity)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/examples/fluids/qfunctions/mass.h b/examples/fluids/qfunctions/mass.h
index 1147a2bb31..42d27b2f68 100644
--- a/examples/fluids/qfunctions/mass.h
+++ b/examples/fluids/qfunctions/mass.h
@@ -7,8 +7,10 @@
 
 /// @file
 /// Mass operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // *****************************************************************************
 // This QFunction applies the mass matrix to five interlaced fields.
diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h
index 66fc309018..a5ca161b3b 100644
--- a/examples/fluids/qfunctions/newtonian.h
+++ b/examples/fluids/qfunctions/newtonian.h
@@ -7,9 +7,11 @@
 
 /// @file
 /// Operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
 #include <stdlib.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h
index fa38c45e68..ab49f0d2c2 100644
--- a/examples/fluids/qfunctions/newtonian_state.h
+++ b/examples/fluids/qfunctions/newtonian_state.h
@@ -9,8 +9,10 @@
 /// Structs and helper functions regarding the state of a newtonian simulation
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_types.h"
 #include "utils.h"
diff --git a/examples/fluids/qfunctions/newtonian_types.h b/examples/fluids/qfunctions/newtonian_types.h
index 60478c397b..b7c4e7e36e 100644
--- a/examples/fluids/qfunctions/newtonian_types.h
+++ b/examples/fluids/qfunctions/newtonian_types.h
@@ -6,7 +6,10 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "stabilization_types.h"
 
diff --git a/examples/fluids/qfunctions/setupgeo.h b/examples/fluids/qfunctions/setupgeo.h
index a4d5181ad7..4e8e9cf8f4 100644
--- a/examples/fluids/qfunctions/setupgeo.h
+++ b/examples/fluids/qfunctions/setupgeo.h
@@ -7,8 +7,10 @@
 
 /// @file
 /// Geometric factors (3D) for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "setupgeo_helpers.h"
 #include "utils.h"
diff --git a/examples/fluids/qfunctions/setupgeo2d.h b/examples/fluids/qfunctions/setupgeo2d.h
index 4bbb39795c..0c2662906c 100644
--- a/examples/fluids/qfunctions/setupgeo2d.h
+++ b/examples/fluids/qfunctions/setupgeo2d.h
@@ -7,7 +7,8 @@
 
 /// @file
 /// Geometric factors (2D) for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+
 #include "setupgeo_helpers.h"
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/setupgeo_helpers.h b/examples/fluids/qfunctions/setupgeo_helpers.h
index 930ff7bb72..b52c3cdcff 100644
--- a/examples/fluids/qfunctions/setupgeo_helpers.h
+++ b/examples/fluids/qfunctions/setupgeo_helpers.h
@@ -9,8 +9,10 @@
 /// Geometric factors (3D) for Navier-Stokes example using PETSc
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/shocktube.h b/examples/fluids/qfunctions/shocktube.h
index 87cdf73d4d..3ff908f4af 100644
--- a/examples/fluids/qfunctions/shocktube.h
+++ b/examples/fluids/qfunctions/shocktube.h
@@ -10,8 +10,11 @@
 
 // Model from:
 //   On the Order of Accuracy and Numerical Performance of Two Classes of Finite Volume WENO Schemes, Zhang, Zhang, and Shu (2011).
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/stabilization.h b/examples/fluids/qfunctions/stabilization.h
index 55d99820c3..655d49e7a8 100644
--- a/examples/fluids/qfunctions/stabilization.h
+++ b/examples/fluids/qfunctions/stabilization.h
@@ -7,7 +7,7 @@
 
 /// @file
 /// Helper functions for computing stabilization terms of a newtonian simulation
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "newtonian_state.h"
 
diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index d1fec17ce5..28a779aa8c 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -12,9 +12,11 @@
 /// SetupSTG_Rand reads in the input files and fills in STGShur14Context.
 /// Then STGShur14_CalcQF is run over quadrature points.
 /// Before the program exits, TearDownSTG is run to free the memory of the allocated arrays.
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
 #include <stdlib.h>
+#endif
 
 #include "newtonian_state.h"
 #include "setupgeo_helpers.h"
diff --git a/examples/fluids/qfunctions/stg_shur14_type.h b/examples/fluids/qfunctions/stg_shur14_type.h
index 5e369cd702..f7c8942614 100644
--- a/examples/fluids/qfunctions/stg_shur14_type.h
+++ b/examples/fluids/qfunctions/stg_shur14_type.h
@@ -6,7 +6,10 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "newtonian_types.h"
 
diff --git a/examples/fluids/qfunctions/strong_boundary_conditions.h b/examples/fluids/qfunctions/strong_boundary_conditions.h
index a503a236d9..7bb0453796 100644
--- a/examples/fluids/qfunctions/strong_boundary_conditions.h
+++ b/examples/fluids/qfunctions/strong_boundary_conditions.h
@@ -4,7 +4,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "setupgeo_helpers.h"
 
diff --git a/examples/fluids/qfunctions/taylorgreen.h b/examples/fluids/qfunctions/taylorgreen.h
index ddf33e665b..3b42fe18d8 100644
--- a/examples/fluids/qfunctions/taylorgreen.h
+++ b/examples/fluids/qfunctions/taylorgreen.h
@@ -4,8 +4,10 @@
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
diff --git a/examples/fluids/qfunctions/turb_spanstats.h b/examples/fluids/qfunctions/turb_spanstats.h
index 344adeebaa..377a0bbf75 100644
--- a/examples/fluids/qfunctions/turb_spanstats.h
+++ b/examples/fluids/qfunctions/turb_spanstats.h
@@ -4,7 +4,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "newtonian_state.h"
 #include "turb_stats_types.h"
diff --git a/examples/fluids/qfunctions/utils.h b/examples/fluids/qfunctions/utils.h
index f414e14e9c..90f67fad24 100644
--- a/examples/fluids/qfunctions/utils.h
+++ b/examples/fluids/qfunctions/utils.h
@@ -6,8 +6,10 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
diff --git a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
index b8236789d2..1c0390d3b9 100644
--- a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
+++ b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
@@ -9,8 +9,11 @@
 /// Eigen system solver for symmetric NxN matrices. Modified from the CC0 code provided at https://github.com/jewettaij/jacobi_pd
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/velocity_gradient_projection.h b/examples/fluids/qfunctions/velocity_gradient_projection.h
index c21bb68adc..28914c13d9 100644
--- a/examples/fluids/qfunctions/velocity_gradient_projection.h
+++ b/examples/fluids/qfunctions/velocity_gradient_projection.h
@@ -4,7 +4,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
diff --git a/examples/mfem/bp1.h b/examples/mfem/bp1.h
index 332340340f..df23dd4b51 100644
--- a/examples/mfem/bp1.h
+++ b/examples/mfem/bp1.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_mass
 struct BuildContext {
diff --git a/examples/mfem/bp3.h b/examples/mfem/bp3.h
index dde37b7446..a546d8aea6 100644
--- a/examples/mfem/bp3.h
+++ b/examples/mfem/bp3.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_diff and f_apply_diff
 struct BuildContext {
diff --git a/examples/nek/bps/bps.h b/examples/nek/bps/bps.h
index 446377b044..a0b6a022c1 100644
--- a/examples/nek/bps/bps.h
+++ b/examples/nek/bps/bps.h
@@ -4,12 +4,12 @@
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
+#pragma once
 
-#ifndef bps_h
-#define bps_h
-
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
@@ -110,5 +110,3 @@ CEED_QFUNCTION(diffusionf)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
   }  // End of Quadrature Point Loop
   return 0;
 }
-
-#endif  // bps_h
diff --git a/examples/petsc/qfunctions/area/areacube.h b/examples/petsc/qfunctions/area/areacube.h
index 93be0594b6..1cc7fcccab 100644
--- a/examples/petsc/qfunctions/area/areacube.h
+++ b/examples/petsc/qfunctions/area/areacube.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factor required for integration when reference coordinates have a different dimension than the one of physical
diff --git a/examples/petsc/qfunctions/area/areasphere.h b/examples/petsc/qfunctions/area/areasphere.h
index 7cd73ca354..88ee221a7f 100644
--- a/examples/petsc/qfunctions/area/areasphere.h
+++ b/examples/petsc/qfunctions/area/areasphere.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factor required for integration when reference coordinates have a different dimension than the one of physical
diff --git a/examples/petsc/qfunctions/bps/bp1.h b/examples/petsc/qfunctions/bps/bp1.h
index a902b29f7c..b5a1f0ad11 100644
--- a/examples/petsc/qfunctions/bps/bp1.h
+++ b/examples/petsc/qfunctions/bps/bp1.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required to apply the mass operator
diff --git a/examples/petsc/qfunctions/bps/bp1sphere.h b/examples/petsc/qfunctions/bps/bp1sphere.h
index d604406f29..0129a3ba66 100644
--- a/examples/petsc/qfunctions/bps/bp1sphere.h
+++ b/examples/petsc/qfunctions/bps/bp1sphere.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required for integration and coordinate transformations when reference coordinates have a different
diff --git a/examples/petsc/qfunctions/bps/bp2.h b/examples/petsc/qfunctions/bps/bp2.h
index 22ba9fb788..12c5fc3521 100644
--- a/examples/petsc/qfunctions/bps/bp2.h
+++ b/examples/petsc/qfunctions/bps/bp2.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
diff --git a/examples/petsc/qfunctions/bps/bp2sphere.h b/examples/petsc/qfunctions/bps/bp2sphere.h
index 36a8e95778..2ebff9ef91 100644
--- a/examples/petsc/qfunctions/bps/bp2sphere.h
+++ b/examples/petsc/qfunctions/bps/bp2sphere.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a vector field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
diff --git a/examples/petsc/qfunctions/bps/bp3.h b/examples/petsc/qfunctions/bps/bp3.h
index dcf84defae..a3674ed031 100644
--- a/examples/petsc/qfunctions/bps/bp3.h
+++ b/examples/petsc/qfunctions/bps/bp3.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for diffusion operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required to apply the diffusion operator
diff --git a/examples/petsc/qfunctions/bps/bp3sphere.h b/examples/petsc/qfunctions/bps/bp3sphere.h
index 1f901dd97a..fdc16b4c84 100644
--- a/examples/petsc/qfunctions/bps/bp3sphere.h
+++ b/examples/petsc/qfunctions/bps/bp3sphere.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for diffusion operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required for integration and coordinate transformations when reference coordinates have a different
diff --git a/examples/petsc/qfunctions/bps/bp4.h b/examples/petsc/qfunctions/bps/bp4.h
index 46307c338a..4f8f6fd58d 100644
--- a/examples/petsc/qfunctions/bps/bp4.h
+++ b/examples/petsc/qfunctions/bps/bp4.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for diffusion operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
diff --git a/examples/petsc/qfunctions/bps/bp4sphere.h b/examples/petsc/qfunctions/bps/bp4sphere.h
index 517f353371..39b631173b 100644
--- a/examples/petsc/qfunctions/bps/bp4sphere.h
+++ b/examples/petsc/qfunctions/bps/bp4sphere.h
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a vector field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
diff --git a/examples/petsc/qfunctions/bps/common.h b/examples/petsc/qfunctions/bps/common.h
index 26f374d5d4..fd38dbc13d 100644
--- a/examples/petsc/qfunctions/bps/common.h
+++ b/examples/petsc/qfunctions/bps/common.h
@@ -8,7 +8,7 @@
 /// @file
 /// libCEED QFunctions for BP examples using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // -----------------------------------------------------------------------------
 CEED_QFUNCTION(Error)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/examples/petsc/qfunctions/swarm/swarmmass.h b/examples/petsc/qfunctions/swarm/swarmmass.h
index e355eff8d7..4c321871fe 100644
--- a/examples/petsc/qfunctions/swarm/swarmmass.h
+++ b/examples/petsc/qfunctions/swarm/swarmmass.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(SetupMass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
diff --git a/examples/solids/qfunctions/common.h b/examples/solids/qfunctions/common.h
index bfdb92522f..acaa815cc5 100644
--- a/examples/solids/qfunctions/common.h
+++ b/examples/solids/qfunctions/common.h
@@ -8,7 +8,7 @@
 /// @file
 /// Geometric factors for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required for integration and coordinate transformations
diff --git a/examples/solids/qfunctions/constant-force.h b/examples/solids/qfunctions/constant-force.h
index a94dc4f3bf..232f97588e 100644
--- a/examples/solids/qfunctions/constant-force.h
+++ b/examples/solids/qfunctions/constant-force.h
@@ -8,8 +8,10 @@
 /// @file
 /// Constant forcing term for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
diff --git a/examples/solids/qfunctions/finite-strain-mooney-rivlin.h b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
index f9c19e81b1..7a802693d2 100644
--- a/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
+++ b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
@@ -8,8 +8,10 @@
 /// @file
 /// Hyperelasticity, finite strain for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // Mooney-Rivlin context
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean.h b/examples/solids/qfunctions/finite-strain-neo-hookean.h
index 42b2b46e2c..9b1ff27979 100644
--- a/examples/solids/qfunctions/finite-strain-neo-hookean.h
+++ b/examples/solids/qfunctions/finite-strain-neo-hookean.h
@@ -8,8 +8,10 @@
 /// @file
 /// Hyperelasticity, finite strain for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
diff --git a/examples/solids/qfunctions/linear.h b/examples/solids/qfunctions/linear.h
index 20b293b6f1..57f5fe4f61 100644
--- a/examples/solids/qfunctions/linear.h
+++ b/examples/solids/qfunctions/linear.h
@@ -8,8 +8,10 @@
 /// @file
 /// Linear elasticity for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
diff --git a/examples/solids/qfunctions/manufactured-force.h b/examples/solids/qfunctions/manufactured-force.h
index 0764d103e3..de48be4ba3 100644
--- a/examples/solids/qfunctions/manufactured-force.h
+++ b/examples/solids/qfunctions/manufactured-force.h
@@ -8,8 +8,10 @@
 /// @file
 /// Linear elasticity manufactured solution forcing term for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
diff --git a/examples/solids/qfunctions/manufactured-true.h b/examples/solids/qfunctions/manufactured-true.h
index 389fb6596f..6fd97c1b13 100644
--- a/examples/solids/qfunctions/manufactured-true.h
+++ b/examples/solids/qfunctions/manufactured-true.h
@@ -8,8 +8,10 @@
 /// @file
 /// Linear elasticity manufactured solution true solution for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // True solution for linear elasticity manufactured solution
diff --git a/examples/solids/qfunctions/traction-boundary.h b/examples/solids/qfunctions/traction-boundary.h
index 181b176d0a..7fd59c742c 100644
--- a/examples/solids/qfunctions/traction-boundary.h
+++ b/examples/solids/qfunctions/traction-boundary.h
@@ -8,7 +8,7 @@
 /// @file
 /// Geometric factors for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // -----------------------------------------------------------------------------
 // This QFunction computes the surface integral of the user traction vector on the constrained faces.
diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
index da92667707..c566147adb 100644
--- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
+++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA atomic add fallback definition
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Atomic add, for older CUDA
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index f3d7052e3c..a7fd0f7072 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA backend macro and type definitions for JiT source
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
index 6b19ad448d..e4c6f1d165 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA non-tensor product basis templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor contraction
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
index 6dbf8771d8..a68b2fd3d6 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA non-tensor product basis
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "cuda-ref-basis-nontensor-templates.h"
 
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 7355705660..1ce11dacfd 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA tensor product basis with AtPoints evaluation
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Chebyshev values
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index 4c8c2f447c..71d8ee8c34 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA tensor product basis
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor Basis Kernels
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
index df5b9ad338..7a74ea9723 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
@@ -7,7 +7,7 @@
 
 /// @file
 /// Internal header for CUDA operator diagonal assembly
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 9fc02a1c7a..8c35dc1ee7 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA operator full assembly
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
index 7fbf7901bc..5bde9a99f7 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA backend QFunction read/write kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Read from quadrature points
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
index 87aeda2e3b..be23954097 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA offset element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, standard (with offsets)
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
index 86a4b53545..5a7a8c29e6 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA curl-oriented element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, curl-oriented
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
index 9492b31984..f93c872ace 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA offset element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, standard (with offsets)
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
index 7c667922bf..a1fda87b40 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA oriented element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, oriented
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
index d10f73c11d..59243fd30a 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA strided element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index 56234c28e4..f7e2a74db0 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA shared memory basis read/write templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // 1D
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index 56989f2b69..f09677828a 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA shared memory tensor product basis templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // 1D
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index c295362978..5f801cf81c 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA shared memory tensor product basis
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "cuda-shared-basis-read-write-templates.h"
 #include "cuda-shared-basis-tensor-templates.h"
diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h
index 1a84718f4a..45c4ebf827 100644
--- a/include/ceed/jit-source/gallery/ceed-identity.h
+++ b/include/ceed/jit-source/gallery/ceed-identity.h
@@ -9,7 +9,7 @@
   @brief  Identity QFunction that copies inputs directly into outputs
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 typedef struct {
   CeedInt size;
diff --git a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
index c266beff64..4b2f60a135 100644
--- a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for building the geometric data for the 1D mass matrix
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians, size (Q)
diff --git a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
index 7e5f6fbd34..3d24858bca 100644
--- a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for building the geometric data for the 2D mass matrix
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians with shape [2, nc=2, Q]
diff --git a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
index 71dc961215..81a23aaff9 100644
--- a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for building the geometric data for the 3D mass matrix
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians with shape [2, nc=3, Q]
diff --git a/include/ceed/jit-source/gallery/ceed-massapply.h b/include/ceed/jit-source/gallery/ceed-massapply.h
index 8559ce8a26..9d1f172190 100644
--- a/include/ceed/jit-source/gallery/ceed-massapply.h
+++ b/include/ceed/jit-source/gallery/ceed-massapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the mass matrix
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is u, size (Q)
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
index dc38d4f21a..0a9dbfd9da 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the 1D Poisson operator
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, size (Q)
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
index dce08aabb2..cad2d5c672 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for building the geometric data for the 1D Poisson operator
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
index dab64be671..e140c8cc8b 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the 2D Poisson operator
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [2, nc=1, Q]
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
index 11e15255ad..91a1736748 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for building the geometric data for the 2D Poisson operator
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
index 71e76926e7..ba25506034 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the geometric data for the 3D Poisson operator
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [3, nc=1, Q]
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
index 2d4e0621e4..640d1e91e7 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for building the geometric data for the 3D Poisson operator
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store the symmetric part of the result.
diff --git a/include/ceed/jit-source/gallery/ceed-scale.h b/include/ceed/jit-source/gallery/ceed-scale.h
index 1249810987..bdf7d7ee1c 100644
--- a/include/ceed/jit-source/gallery/ceed-scale.h
+++ b/include/ceed/jit-source/gallery/ceed-scale.h
@@ -9,7 +9,7 @@
   @brief  Scaling QFunction that scales inputs
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Ctx holds field size
diff --git a/include/ceed/jit-source/gallery/ceed-vectormassapply.h b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
index 70a2f3e25c..1486a038b3 100644
--- a/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the mass matrix on a vector system with three components
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is u, size (Q)
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
index e056729422..e5f37c3f79 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the 1D Poisson operator on a vector system with three components
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [1, nc=3, Q]
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
index 1b56240048..3742c67dbb 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the 2D Poisson operator on a vector system with three components
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [2, nc=3, Q]
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
index 9ca86dba01..29545099b9 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
@@ -9,7 +9,7 @@
   @brief Ceed QFunction for applying the geometric data for the 3D Poisson on a vector system with three components operator
 **/
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [3, nc=3, Q]
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 812e901866..fd014dbb8a 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP backend macro and type definitions for JiT source
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
index 0374d459d5..c8160badf1 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP non-tensor product basis templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor contraction
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
index 953f6f48e3..bf950e11ab 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP non-tensor product basis
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "hip-ref-basis-nontensor-templates.h"
 
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 4744b17eb2..42e63cab65 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for CUDA tensor product basis with AtPoints evaluation
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Chebyshev values
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index db509ac2a0..8ddb3552b1 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP tensor product basis
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor Basis Kernels
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
index e6a8b6e6a1..9e4fef865c 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP operator diagonal assembly
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index 838dcfd4a5..a9b5bd7bc4 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP operator full assembly
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
diff --git a/include/ceed/jit-source/hip/hip-ref-qfunction.h b/include/ceed/jit-source/hip/hip-ref-qfunction.h
index 1b423072af..6524f74dfd 100644
--- a/include/ceed/jit-source/hip/hip-ref-qfunction.h
+++ b/include/ceed/jit-source/hip/hip-ref-qfunction.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP backend QFunction read/write kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Read from quadrature points
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
index f4cb95993b..5d5b3f9812 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP offset element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, standard (with offsets)
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
index 76d9758828..33b03ab673 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP curl-oriented element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, curl-oriented
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
index 65283b7193..a10c7fe957 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP offset element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, standard (with offsets)
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
index f983a24fc0..07d13d6515 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP oriented element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, oriented
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
index de1335c117..2f8bd18446 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP strided element restriction kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index 379d52d13b..47311c3361 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP shared memory basis read/write templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Helper function: load matrices for basis actions
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index 8dc50e4ed8..ef2ea90ec6 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP shared memory tensor product basis templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // 1D
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index d052e53bf1..9c33a96b51 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for HIP shared memory tensor product basis
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "hip-shared-basis-read-write-templates.h"
 #include "hip-shared-basis-tensor-templates.h"
diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h
index aa54232c2d..cf6f6cbc15 100644
--- a/include/ceed/jit-source/sycl/sycl-gen-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h
@@ -7,7 +7,7 @@
 
 /// @file
 /// Internal header for SYCL backend macro and type definitions for JiT source
-#include <ceed.h>
+#include <ceed/types.h>
 
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
diff --git a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
index d62de2533a..4a2ae56e94 100644
--- a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
+++ b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for SYCL backend QFunction read/write kernels
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Read from quadrature points
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
index 421875b509..5ddc1e84c8 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for SYCL shared memory basis read/write templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 #include "sycl-types.h"
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
index 28bd24d9f9..9933b2f5f0 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for SYCL shared memory tensor product basis templates
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // 1D
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
index f8e4ccdc0a..60b8bf531d 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
@@ -8,7 +8,7 @@
 /// @file
 /// Internal header for SYCL shared memory tensor product basis
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "sycl-shared-basis-read-write-templates.h"
 #include "sycl-shared-basis-tensor-templates.h"
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 02715249d4..7c0cf3a285 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -608,6 +608,8 @@ int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops) {
   @param[in]  source     Absolute path to source of `CeedQFunctionUser`, "\abs_path\file.h:function_name".
                            The entire source file must only contain constructs supported by all targeted backends (i.e. CUDA for `/gpu/cuda`, OpenCL/SYCL for `/gpu/sycl`, etc.).
                            The entire contents of this file and all locally included files are used during JiT compilation for GPU backends.
+                           The header `ceed/types.h` is preferred over `ceed.h` or `ceed/ceed.h` for `CeedQFunction` source files.
+                           The macro `CEED_RUNNING_JIT_PASS` is set during JiT and can be used to guard include statements that JiT compilers cannot use, such as `math.h` or `std*.h`.
                            All source files must be at the provided filepath at runtime for JiT to function.
   @param[out] qf         Address of the variable where the newly created `CeedQFunction` will be stored
 
diff --git a/tests/t400-qfunction.h b/tests/t400-qfunction.h
index 1fb64842fd..b3e226df14 100644
--- a/tests/t400-qfunction.h
+++ b/tests/t400-qfunction.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
diff --git a/tests/t401-qfunction.h b/tests/t401-qfunction.h
index c61cdb8ac6..465ec0b119 100644
--- a/tests/t401-qfunction.h
+++ b/tests/t401-qfunction.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
diff --git a/tests/t405-qfunction.h b/tests/t405-qfunction.h
index eaf261791f..40be19b47d 100644
--- a/tests/t405-qfunction.h
+++ b/tests/t405-qfunction.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
diff --git a/tests/t406-qfunction-helper.h b/tests/t406-qfunction-helper.h
index 4c41de3a33..85fdf9999c 100644
--- a/tests/t406-qfunction-helper.h
+++ b/tests/t406-qfunction-helper.h
@@ -10,6 +10,7 @@
 # pragma  once
 // clang-format on
 
+// Note - ceed/types.h should be used over ceed.h
 #include <ceed.h>
 
 // Test include path with "/./"
diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h
index 8be7fd8d34..642de84a67 100644
--- a/tests/t406-qfunction.h
+++ b/tests/t406-qfunction.h
@@ -7,8 +7,12 @@
 
 // Note: intentionally testing strange spacing in include's
 // clang-format off
+// Note - ceed/types.h should be used over ceed.h
 #include <ceed.h>
-#  include  <math.h>
+// Note - system headers like math.h and std*.h should be guarded
+#ifndef CEED_RUNNING_JIT_PASS
+#  include <math.h>
+#endif
 
 #include "t406-qfunction-helper.h"
 // Test duplicate includes of guarded files
diff --git a/tests/t409-qfunction.h b/tests/t409-qfunction.h
index 27e2c6585e..5348ffeb9d 100644
--- a/tests/t409-qfunction.h
+++ b/tests/t409-qfunction.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   CeedScalar       *scale = (CeedScalar *)ctx;
diff --git a/tests/t500-operator.h b/tests/t500-operator.h
index de9ca8966a..777978bc34 100644
--- a/tests/t500-operator.h
+++ b/tests/t500-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *dxdX = in[1];
diff --git a/tests/t502-operator.h b/tests/t502-operator.h
index 9d343b5ab9..9915ee4282 100644
--- a/tests/t502-operator.h
+++ b/tests/t502-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *dxdX = in[1];
diff --git a/tests/t507-operator.h b/tests/t507-operator.h
index 5d245534be..3166f2ee69 100644
--- a/tests/t507-operator.h
+++ b/tests/t507-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *dxdX = in[1];
diff --git a/tests/t510-operator.h b/tests/t510-operator.h
index 01cf47450c..20677b157a 100644
--- a/tests/t510-operator.h
+++ b/tests/t510-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t522-operator.h b/tests/t522-operator.h
index 3f70b7d354..52aa9bae28 100644
--- a/tests/t522-operator.h
+++ b/tests/t522-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *qw = in[0], *J = in[1];
diff --git a/tests/t530-operator.h b/tests/t530-operator.h
index 01cf47450c..20677b157a 100644
--- a/tests/t530-operator.h
+++ b/tests/t530-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t531-operator.h b/tests/t531-operator.h
index 4050ca35dc..79a083f032 100644
--- a/tests/t531-operator.h
+++ b/tests/t531-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
diff --git a/tests/t532-operator.h b/tests/t532-operator.h
index e15e3aed19..6de6e8b669 100644
--- a/tests/t532-operator.h
+++ b/tests/t532-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *J = in[0], *weight = in[1];
diff --git a/tests/t534-operator.h b/tests/t534-operator.h
index 3fc4c58887..cfe2bf73ac 100644
--- a/tests/t534-operator.h
+++ b/tests/t534-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
diff --git a/tests/t535-operator.h b/tests/t535-operator.h
index 7f6797608c..ba3d5498cb 100644
--- a/tests/t535-operator.h
+++ b/tests/t535-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *J = in[0], *weight = in[1];
diff --git a/tests/t537-operator.h b/tests/t537-operator.h
index 80b2d22d73..f42f4fc1e4 100644
--- a/tests/t537-operator.h
+++ b/tests/t537-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t539-operator.h b/tests/t539-operator.h
index 3a4fda2475..c51487250b 100644
--- a/tests/t539-operator.h
+++ b/tests/t539-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(apply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u_0, shape [2, num_comp=2, Q]
diff --git a/tests/t540-operator.h b/tests/t540-operator.h
index 79f5006719..f6052946aa 100644
--- a/tests/t540-operator.h
+++ b/tests/t540-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *J = in[0], *weight = in[1];
diff --git a/tests/t541-operator.h b/tests/t541-operator.h
index 7eaa675c97..2f588f76be 100644
--- a/tests/t541-operator.h
+++ b/tests/t541-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians with shape [2, nc=2, Q]
diff --git a/tests/t566-operator.h b/tests/t566-operator.h
index dfd0da43a2..a1c57cae55 100644
--- a/tests/t566-operator.h
+++ b/tests/t566-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t567-operator.h b/tests/t567-operator.h
index 6b645272dc..faee0aa5ac 100644
--- a/tests/t567-operator.h
+++ b/tests/t567-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w = in[0], (*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[1];
diff --git a/tests/t568-operator.h b/tests/t568-operator.h
index d52bc2d800..8cbb0ba8bf 100644
--- a/tests/t568-operator.h
+++ b/tests/t568-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
diff --git a/tests/t580-operator.h b/tests/t580-operator.h
index 940a3605fc..e53f7817de 100644
--- a/tests/t580-operator.h
+++ b/tests/t580-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // Compute det(A)
 CEED_QFUNCTION_HELPER CeedScalar MatDet2x2(const CeedScalar A[2][2]) { return A[0][0] * A[1][1] - A[1][0] * A[0][1]; }
diff --git a/tests/t590-operator.h b/tests/t590-operator.h
index a2018718f8..d4c45b3735 100644
--- a/tests/t590-operator.h
+++ b/tests/t590-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *u = in[0];
diff --git a/tests/t591-operator.h b/tests/t591-operator.h
index 1c64f1181f..3385bf9dcc 100644
--- a/tests/t591-operator.h
+++ b/tests/t591-operator.h
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
diff --git a/tests/test-include/fake-sys-include.h b/tests/test-include/fake-sys-include.h
index ec1c9b2c56..edb954cb54 100644
--- a/tests/test-include/fake-sys-include.h
+++ b/tests/test-include/fake-sys-include.h
@@ -1,10 +1,14 @@
 #define FAKE_SYS_SCALE_ONE 1
 
 // Note - files included this way cannot transitively include any files CUDA/ROCm won't compile
-// These are bad
-// #include <math.h>
-// #include <stddef.h>
+// These are bad and need to be guarded
+#ifndef CEED_RUNNING_JIT_PASS
+#include <math.h>
+#include <stddef.h>
+#endif
 
 // These are ok
+// Note - ceed/types.h should be used over ceed.h
+//        ceed.h is replaced with ceed/types.h during JiT
 #include <ceed.h>
 #include <ceed/types.h>

From 9c25dd66b9687765a7022cc762ccaf201b721845 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Oct 2024 09:14:54 -0600
Subject: [PATCH 211/571] cuda/hip - use new include pattern for JiT

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 45 +++-------
 backends/cuda-gen/ceed-cuda-gen-qfunction.c   |  5 --
 backends/cuda-gen/ceed-cuda-gen.h             |  1 -
 backends/cuda-ref/ceed-cuda-ref-basis.c       | 51 +++---------
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 24 ++----
 .../cuda-ref/ceed-cuda-ref-qfunction-load.cpp | 35 +++-----
 backends/cuda-ref/ceed-cuda-ref-qfunction.c   |  6 +-
 backends/cuda-ref/ceed-cuda-ref-restriction.c | 81 +++++-------------
 backends/cuda-ref/ceed-cuda-ref.h             |  1 -
 backends/cuda-shared/ceed-cuda-shared-basis.c | 21 +----
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 34 +++-----
 backends/hip-gen/ceed-hip-gen-qfunction.c     |  5 --
 backends/hip-gen/ceed-hip-gen.h               |  1 -
 backends/hip-ref/ceed-hip-ref-basis.c         | 51 +++---------
 backends/hip-ref/ceed-hip-ref-operator.c      | 24 ++----
 .../hip-ref/ceed-hip-ref-qfunction-load.cpp   | 31 +++----
 backends/hip-ref/ceed-hip-ref-qfunction.c     |  6 +-
 backends/hip-ref/ceed-hip-ref-restriction.c   | 82 +++++--------------
 backends/hip-ref/ceed-hip-ref.h               |  1 -
 backends/hip-shared/ceed-hip-shared-basis.c   | 21 +----
 interface/ceed-preconditioning.c              |  3 +
 21 files changed, 133 insertions(+), 396 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index c744ea4254..315db3844f 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -696,42 +696,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
     CeedCallBackend(CeedGetData(ceed, &ceed_data));
     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
-      char       *atomic_add_source;
-      const char *atomic_add_path;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source));
-      code << atomic_add_source;
-      CeedCallBackend(CeedFree(&atomic_add_path));
-      CeedCallBackend(CeedFree(&atomic_add_source));
+      code << "// AtomicAdd fallback source\n";
+      code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
     }
   }
 
   // Load basis source files
   // TODO: Add non-tensor, AtPoints
-  {
-    char       *tensor_basis_kernel_source;
-    const char *tensor_basis_kernel_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
-    code << tensor_basis_kernel_source;
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
-  }
-  {
-    char       *cuda_gen_template_source;
-    const char *cuda_gen_template_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source));
-    code << cuda_gen_template_source;
-    CeedCallBackend(CeedFree(&cuda_gen_template_path));
-    CeedCallBackend(CeedFree(&cuda_gen_template_source));
-  }
+  code << "// Tensor basis source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  code << "// CodeGen operator source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
 
   // Get QFunction name
   std::string qfunction_name(qf_data->qfunction_name);
@@ -749,9 +724,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
   // Add user QFunction source
   {
-    std::string qfunction_source(qf_data->qfunction_source);
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
 
-    code << qfunction_source;
+    code << "// User QFunction source\n";
+    code << "#include \"" << source_path << "\"\n\n";
   }
 
   // Setup
diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
index ccff67a476..aec5294a8d 100644
--- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c
+++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
@@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
   CeedCallCuda(CeedQFunctionReturnCeed(qf), cudaFree(data->d_c));
-  CeedCallBackend(CeedFree(&data->qfunction_source));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -45,10 +44,6 @@ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) {
 
   // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda_gen));
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index d10dece242..bd0c76e671 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -26,7 +26,6 @@ typedef struct {
 
 typedef struct {
   const char *qfunction_name;
-  const char *qfunction_source;
   void       *d_c;
 } CeedQFunction_Cuda_gen;
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 1c38ce002c..3eca8134c3 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -182,24 +182,17 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
     }
 
     // -- Compile kernels
-    char       *basis_kernel_source;
-    const char *basis_kernel_path;
-    CeedInt     num_comp;
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
 
     if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                      Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                      max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
-    CeedCallBackend(CeedFree(&basis_kernel_path));
-    CeedCallBackend(CeedFree(&basis_kernel_source));
   }
 
   // Get read/write access to u, v
@@ -419,8 +412,6 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed            ceed;
-  char           *basis_kernel_source;
-  const char     *basis_kernel_path;
   CeedInt         num_comp;
   const CeedInt   q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt   interp_bytes = q_bytes * P_1d;
@@ -440,19 +431,15 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                    Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
@@ -471,8 +458,6 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                            const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                     ceed;
-  char                    *basis_kernel_source;
-  const char              *basis_kernel_path;
   CeedInt                  num_comp, q_comp_interp, q_comp_grad;
   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Cuda *data;
@@ -501,11 +486,9 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
@@ -513,8 +496,6 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
@@ -531,8 +512,6 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
                              const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                     ceed;
-  char                    *basis_kernel_source;
-  const char              *basis_kernel_path;
   CeedInt                  num_comp, q_comp_interp, q_comp_div;
   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Cuda *data;
@@ -561,11 +540,9 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
@@ -573,8 +550,6 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
@@ -591,8 +566,6 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                               const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                     ceed;
-  char                    *basis_kernel_source;
-  const char              *basis_kernel_path;
   CeedInt                  num_comp, q_comp_interp, q_comp_curl;
   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Cuda *data;
@@ -621,11 +594,9 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
@@ -633,8 +604,6 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index ceb940dad6..8cb8855ba7 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1286,8 +1286,6 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) {
   Ceed                ceed;
-  char               *diagonal_kernel_source;
-  const char         *diagonal_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             num_comp, q_comp, num_nodes, num_qpts;
   CeedBasis           basis_in = NULL, basis_out = NULL;
@@ -1351,22 +1349,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
   CeedOperatorDiag_Cuda *diag = impl->diag;
 
   // Assemble kernel
-  CUmodule *module          = is_point_block ? &diag->module_point_block : &diag->module;
-  CeedInt   elems_per_block = 1;
+  const char diagonal_kernel_source[] = "// Diagonal assembly source\n#include <ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h>\n";
+  CUmodule  *module                   = is_point_block ? &diag->module_point_block : &diag->module;
+  CeedInt    elems_per_block          = 1;
+
   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
   CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
   CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                       num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
                                       use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
   CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
-  CeedCallBackend(CeedFree(&diagonal_kernel_path));
-  CeedCallBackend(CeedFree(&diagonal_kernel_source));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
@@ -1481,8 +1475,6 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
 static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
   Ceed_Cuda          *cuda_data;
-  char               *assembly_kernel_source;
-  const char         *assembly_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp;
   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
@@ -1589,20 +1581,16 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
   }
 
   // Compile kernels
+  const char assembly_kernel_source[] = "// Full assembly source\n#include <ceed/jit-source/cuda/cuda-ref-operator-assemble.h>\n";
+
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                    num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in,
                                    "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
                                    asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y,
                                    "USE_CEEDSIZE", use_ceedsize_idx));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble));
-  CeedCallBackend(CeedFree(&assembly_kernel_path));
-  CeedCallBackend(CeedFree(&assembly_kernel_source));
 
   // Load into B_in, in order that they will be used in eval_modes_in
   {
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
index ed40b1fca9..2d5540ead8 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
@@ -24,7 +24,6 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   using std::string;
 
   Ceed                ceed;
-  const char         *read_write_kernel_path, *read_write_kernel_source;
   CeedInt             num_input_fields, num_output_fields, size;
   CeedQFunctionField *input_fields, *output_fields;
   CeedQFunction_Cuda *data;
@@ -35,31 +34,26 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   // QFunction is built
   if (data->QFunction) return CEED_ERROR_SUCCESS;
 
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided.");
-
   // QFunction kernel generation
   CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
 
   // Build strings for final kernel
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-qfunction.h", &read_write_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n");
-  {
-    char *source;
-
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &source));
-    read_write_kernel_source = source;
-  }
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n");
-  string        qfunction_source(data->qfunction_source);
   string        qfunction_name(data->qfunction_name);
-  string        read_write(read_write_kernel_source);
   string        kernel_name = "CeedKernelCudaRefQFunction_" + qfunction_name;
   ostringstream code;
 
-  // Defintions
-  code << read_write;
-  code << qfunction_source;
-  code << "\n";
+  // Definitions
+  code << "// QFunction source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-ref-qfunction.h>\n\n";
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided.");
+
+    code << "// User QFunction source\n";
+    code << "#include \"" << source_path << "\"\n\n";
+  }
   code << "extern \"C\" __global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Cuda fields) {\n";
 
   // Inputs
@@ -118,11 +112,6 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   // Compile kernel
   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 0));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, kernel_name.c_str(), &data->QFunction));
-
-  // Cleanup
-  CeedCallBackend(CeedFree(&data->qfunction_source));
-  CeedCallBackend(CeedFree(&read_write_kernel_path));
-  CeedCallBackend(CeedFree(&read_write_kernel_source));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
index 957f02cbbe..eec4aea26c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
@@ -68,7 +68,6 @@ static int CeedQFunctionDestroy_Cuda(CeedQFunction qf) {
   CeedQFunction_Cuda *data;
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
-  CeedCallBackend(CeedFree(&data->qfunction_source));
   if (data->module) CeedCallCuda(CeedQFunctionReturnCeed(qf), cuModuleUnload(data->module));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
@@ -96,11 +95,8 @@ int CeedQFunctionCreate_Cuda(CeedQFunction qf) {
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedQFunctionSetData(qf, data));
 
-  // Read QFunction source
+  // Read QFunction name
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda));
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index f89e1f694d..c4a5c22dda 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -24,8 +24,6 @@
 static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) {
   Ceed                      ceed;
   bool                      is_deterministic;
-  char                     *restriction_kernel_source;
-  const char               *restriction_kernel_path;
   CeedInt                   num_elem, num_comp, elem_size, comp_stride;
   CeedRestrictionType       rstr_type;
   CeedElemRestriction_Cuda *impl;
@@ -46,67 +44,45 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
   // Compile CUDA kernels
   switch (rstr_type) {
     case CEED_RESTRICTION_STRIDED: {
-      bool    has_backend_strides;
-      CeedInt strides[3] = {1, num_elem * elem_size, elem_size};
+      const char restriction_kernel_source[] = "// Strided restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-strided.h>\n";
+      bool       has_backend_strides;
+      CeedInt    strides[3] = {1, num_elem * elem_size, elem_size};
 
       CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
       if (!has_backend_strides) {
         CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
       }
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-strided.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM",
                                        strides[2]));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
-    case CEED_RESTRICTION_POINTS: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-at-points.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+    case CEED_RESTRICTION_STANDARD: {
+      const char restriction_kernel_source[] = "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
-      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
-      CeedCallBackend(CeedFree(&file_paths));
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose));
     } break;
-    case CEED_RESTRICTION_STANDARD: {
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+    case CEED_RESTRICTION_POINTS: {
+      const char restriction_kernel_source[] =
+          "// AtPoints restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-at-points.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
-      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose));
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
     } break;
     case CEED_RESTRICTION_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -114,22 +90,11 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
-      CeedCallBackend(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_CURL_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Curl oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -139,14 +104,8 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
-      CeedCallBackend(CeedFree(&file_paths));
     } break;
   }
-  CeedCallBackend(CeedFree(&restriction_kernel_path));
-  CeedCallBackend(CeedFree(&restriction_kernel_source));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 9e167463bd..0f6ca9d1cb 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -97,7 +97,6 @@ typedef struct {
 typedef struct {
   CUmodule    module;
   const char *qfunction_name;
-  const char *qfunction_source;
   CUfunction  QFunction;
   Fields_Cuda fields;
   void       *d_c;
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index fcd09b10f3..b1709787ec 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -283,24 +283,17 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
     }
 
     // -- Compile kernels
-    char       *basis_kernel_source;
-    const char *basis_kernel_path;
-    CeedInt     num_comp;
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
 
     if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                      Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                      max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
-    CeedCallBackend(CeedFree(&basis_kernel_path));
-    CeedCallBackend(CeedFree(&basis_kernel_source));
   }
 
   // Get read/write access to u, v
@@ -395,8 +388,6 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                         const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed                   ceed;
-  char                  *basis_kernel_source;
-  const char            *basis_kernel_path;
   CeedInt                num_comp;
   const CeedInt          q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt          interp_bytes = q_bytes * P_1d;
@@ -430,11 +421,9 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
                                    CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                    "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad));
@@ -445,8 +434,6 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index ee0dea2609..f1a876ce26 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -707,28 +707,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
   // Load basis source files
   // TODO: Add non-tensor, AtPoints
-  {
-    char       *tensor_basis_kernel_source;
-    const char *tensor_basis_kernel_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
-    code << tensor_basis_kernel_source;
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
-  }
-  {
-    char       *hip_gen_template_source;
-    const char *hip_gen_template_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-gen-templates.h", &hip_gen_template_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Hip-Gen Template Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source));
-    code << hip_gen_template_source;
-    CeedCallBackend(CeedFree(&hip_gen_template_path));
-    CeedCallBackend(CeedFree(&hip_gen_template_source));
-  }
+  code << "// Tensor basis source\n";
+  code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  code << "// CodeGen operator source\n";
+  code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
 
   // Get QFunction name
   std::string qfunction_name(qf_data->qfunction_name);
@@ -746,9 +728,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
   // Add user QFunction source
   {
-    std::string qfunction_source(qf_data->qfunction_source);
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
 
-    code << qfunction_source;
+    code << "// User QFunction source\n";
+    code << "#include \"" << source_path << "\"\n\n";
   }
 
   // Setup
diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c
index ed10d81ad3..32d5653b98 100644
--- a/backends/hip-gen/ceed-hip-gen-qfunction.c
+++ b/backends/hip-gen/ceed-hip-gen-qfunction.c
@@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Hip_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
   CeedCallHip(CeedQFunctionReturnCeed(qf), hipFree(data->d_c));
-  CeedCallBackend(CeedFree(&data->qfunction_source));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -45,10 +44,6 @@ int CeedQFunctionCreate_Hip_gen(CeedQFunction qf) {
 
   // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip_gen));
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index a0a8ac5511..139bab43bb 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -26,7 +26,6 @@ typedef struct {
 
 typedef struct {
   const char *qfunction_name;
-  const char *qfunction_source;
   void       *d_c;
 } CeedQFunction_Hip_gen;
 
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index f54184f28d..7e7f0e97e4 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -180,24 +180,17 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
     }
 
     // -- Compile kernels
-    char       *basis_kernel_source;
-    const char *basis_kernel_path;
-    CeedInt     num_comp;
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
 
     if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h", &basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
-    CeedCallBackend(CeedFree(&basis_kernel_path));
-    CeedCallBackend(CeedFree(&basis_kernel_source));
   }
 
   // Get read/write access to u, v
@@ -414,8 +407,6 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed           ceed;
-  char          *basis_kernel_source;
-  const char    *basis_kernel_path;
   CeedInt        num_comp;
   const CeedInt  q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt  interp_bytes = q_bytes * P_1d;
@@ -435,19 +426,15 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice));
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                   Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
@@ -466,8 +453,6 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                    ceed;
-  char                   *basis_kernel_source;
-  const char             *basis_kernel_path;
   CeedInt                 num_comp, q_comp_interp, q_comp_grad;
   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Hip *data;
@@ -496,11 +481,9 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
@@ -508,8 +491,6 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
@@ -526,8 +507,6 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                    ceed;
-  char                   *basis_kernel_source;
-  const char             *basis_kernel_path;
   CeedInt                 num_comp, q_comp_interp, q_comp_div;
   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Hip *data;
@@ -556,11 +535,9 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
@@ -568,8 +545,6 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
@@ -586,8 +561,6 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                              const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                    ceed;
-  char                   *basis_kernel_source;
-  const char             *basis_kernel_path;
   CeedInt                 num_comp, q_comp_interp, q_comp_curl;
   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Hip *data;
@@ -616,11 +589,9 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
@@ -628,8 +599,6 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index c6307037fa..c2e58a1e7f 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1283,8 +1283,6 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) {
   Ceed                ceed;
-  char               *diagonal_kernel_source;
-  const char         *diagonal_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             num_comp, q_comp, num_nodes, num_qpts;
   CeedBasis           basis_in = NULL, basis_out = NULL;
@@ -1348,22 +1346,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
   CeedOperatorDiag_Hip *diag = impl->diag;
 
   // Assemble kernel
-  hipModule_t *module          = is_point_block ? &diag->module_point_block : &diag->module;
-  CeedInt      elems_per_block = 1;
+  const char   diagonal_kernel_source[] = "// Diagonal assembly source\n#include <ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h>\n";
+  hipModule_t *module                   = is_point_block ? &diag->module_point_block : &diag->module;
+  CeedInt      elems_per_block          = 1;
+
   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
   CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
   CeedCallHip(ceed, CeedCompile_Hip(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                     num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
                                     use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
   CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
-  CeedCallBackend(CeedFree(&diagonal_kernel_path));
-  CeedCallBackend(CeedFree(&diagonal_kernel_source));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
@@ -1478,8 +1472,6 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op,
 static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
   Ceed_Hip           *hip_data;
-  char               *assembly_kernel_source;
-  const char         *assembly_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp;
   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
@@ -1586,20 +1578,16 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
   }
 
   // Compile kernels
+  const char assembly_kernel_source[] = "// Full assembly source\n#include <ceed/jit-source/hip/hip-ref-operator-assemble.h>\n";
+
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                   num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in,
                                   "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
                                   asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE",
                                   use_ceedsize_idx));
   CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble));
-  CeedCallBackend(CeedFree(&assembly_kernel_path));
-  CeedCallBackend(CeedFree(&assembly_kernel_source));
 
   // Load into B_in, in order that they will be used in eval_modes_in
   {
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
index 3ba4f23266..2311f8a332 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
@@ -25,8 +25,6 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   using std::string;
 
   Ceed                ceed;
-  char               *read_write_kernel_source;
-  const char         *read_write_kernel_path;
   Ceed_Hip           *ceed_Hip;
   CeedInt             num_input_fields, num_output_fields, size;
   CeedQFunctionField *input_fields, *output_fields;
@@ -39,26 +37,26 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   // QFunction is built
   if (data->QFunction) return CEED_ERROR_SUCCESS;
 
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided.");
-
   // QFunction kernel generation
   CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
 
   // Build strings for final kernel
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-qfunction.h", &read_write_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n");
-  string        qfunction_source(data->qfunction_source);
   string        qfunction_name(data->qfunction_name);
-  string        read_write(read_write_kernel_source);
   string        kernel_name = "CeedKernelHipRefQFunction_" + qfunction_name;
   ostringstream code;
 
-  // Defintions
-  code << read_write;
-  code << qfunction_source;
-  code << "\n";
+  // Definitions
+  code << "// QFunction source\n";
+  code << "#include <ceed/jit-source/hip/hip-ref-qfunction.h>\n\n";
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided.");
+
+    code << "// User QFunction source\n";
+    code << "#include \"" << source_path << "\"\n\n";
+  }
   code << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
   code << "__global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Hip fields) {\n";
 
@@ -118,11 +116,6 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   // Compile kernel
   CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 1, "BLOCK_SIZE", ceed_Hip->opt_block_size));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, kernel_name.c_str(), &data->QFunction));
-
-  // Cleanup
-  CeedCallBackend(CeedFree(&data->qfunction_source));
-  CeedCallBackend(CeedFree(&read_write_kernel_path));
-  CeedCallBackend(CeedFree(&read_write_kernel_source));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c
index 15d2dc7ae4..18d531ac11 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunction.c
@@ -70,7 +70,6 @@ static int CeedQFunctionDestroy_Hip(CeedQFunction qf) {
   CeedQFunction_Hip *data;
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
-  CeedCallBackend(CeedFree(&data->qfunction_source));
   if (data->module) CeedCallHip(CeedQFunctionReturnCeed(qf), hipModuleUnload(data->module));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
@@ -89,11 +88,8 @@ int CeedQFunctionCreate_Hip(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionSetData(qf, data));
   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
 
-  // Read QFunction source
+  // Read QFunction name
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip));
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 95b0961387..ca1d19d7a6 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -23,8 +23,6 @@
 static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) {
   Ceed                     ceed;
   bool                     is_deterministic;
-  char                    *restriction_kernel_source;
-  const char              *restriction_kernel_path;
   CeedInt                  num_elem, num_comp, elem_size, comp_stride;
   CeedRestrictionType      rstr_type;
   CeedElemRestriction_Hip *impl;
@@ -45,67 +43,46 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
   // Compile HIP kernels
   switch (rstr_type) {
     case CEED_RESTRICTION_STRIDED: {
-      bool    has_backend_strides;
-      CeedInt strides[3] = {1, num_elem * elem_size, elem_size};
+      const char restriction_kernel_source[] = "// Strided restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-strided.h>\n";
+      bool       has_backend_strides;
+      CeedInt    strides[3] = {1, num_elem * elem_size, elem_size};
 
       CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
       if (!has_backend_strides) {
         CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
       }
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-strided.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM",
                                       strides[2]));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
-    case CEED_RESTRICTION_POINTS: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-at-points.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+    case CEED_RESTRICTION_STANDARD: {
+      const char restriction_kernel_source[] = "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
-      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
-      CeedCallBackend(CeedFree(&file_paths));
+      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose));
     } break;
-    case CEED_RESTRICTION_STANDARD: {
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
+    case CEED_RESTRICTION_POINTS: {
+      const char restriction_kernel_source[] =
+          "// AtPoints restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-at-points.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
-      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose));
+      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
     } break;
     case CEED_RESTRICTION_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Oriented restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -113,22 +90,12 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
-      CeedCallBackend(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_CURL_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Curl oriented restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -138,14 +105,9 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
-      CeedCallBackend(CeedFree(&file_paths));
+
     } break;
   }
-  CeedCallBackend(CeedFree(&restriction_kernel_path));
-  CeedCallBackend(CeedFree(&restriction_kernel_source));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 52e88129a1..5a695761a9 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -101,7 +101,6 @@ typedef struct {
 typedef struct {
   hipModule_t   module;
   const char   *qfunction_name;
-  const char   *qfunction_source;
   hipFunction_t QFunction;
   Fields_Hip    fields;
   void         *d_c;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 05b564e7f2..cdcc28ce07 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -342,24 +342,17 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
     }
 
     // -- Compile kernels
-    char       *basis_kernel_source;
-    const char *basis_kernel_path;
-    CeedInt     num_comp;
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
 
     if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h", &basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
     CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
-    CeedCallBackend(CeedFree(&basis_kernel_path));
-    CeedCallBackend(CeedFree(&basis_kernel_source));
   }
 
   // Get read/write access to u, v
@@ -454,8 +447,6 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                        const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed                  ceed;
-  char                 *basis_kernel_source;
-  const char           *basis_kernel_path;
   CeedInt               num_comp;
   const CeedInt         q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt         interp_bytes = q_bytes * P_1d;
@@ -493,10 +484,8 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes));
 
   // Compile basis kernels
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor.h>\n";
+
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
                                   CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                   "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
@@ -509,8 +498,6 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 346135ebdf..3e82b38493 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -49,10 +49,13 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, Cee
 
   if (qf->source_path) {
     size_t path_len = strlen(qf->source_path), name_len = strlen(qf->kernel_name);
+
     CeedCall(CeedCalloc(path_len + name_len + 2, &source_path_with_name));
     memcpy(source_path_with_name, qf->source_path, path_len);
     memcpy(&source_path_with_name[path_len], ":", 1);
     memcpy(&source_path_with_name[path_len + 1], qf->kernel_name, name_len);
+  } else if (qf->user_source) {
+    CeedCall(CeedStringAllocCopy(qf->user_source, &source_path_with_name));
   } else {
     CeedCall(CeedCalloc(1, &source_path_with_name));
   }

From 6a96780f4d953613bf84434507c5fffdcf1ec0dd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Oct 2024 15:31:00 -0600
Subject: [PATCH 212/571] minor - header consistency

---
 backends/avx/ceed-avx-tensor.c                             | 2 +-
 include/ceed/ceed-f32.h                                    | 7 +++----
 include/ceed/ceed-f64.h                                    | 7 +++----
 include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h    | 1 -
 include/ceed/jit-source/cuda/cuda-gen-templates.h          | 1 -
 .../jit-source/cuda/cuda-ref-basis-nontensor-templates.h   | 1 -
 include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h    | 1 -
 .../ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h | 1 -
 include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h       | 1 -
 include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h  | 1 -
 include/ceed/jit-source/cuda/cuda-ref-qfunction.h          | 1 -
 .../ceed/jit-source/cuda/cuda-ref-restriction-at-points.h  | 1 -
 .../jit-source/cuda/cuda-ref-restriction-curl-oriented.h   | 1 -
 include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h | 1 -
 .../ceed/jit-source/cuda/cuda-ref-restriction-oriented.h   | 1 -
 .../ceed/jit-source/cuda/cuda-ref-restriction-strided.h    | 1 -
 .../cuda/cuda-shared-basis-read-write-templates.h          | 1 -
 .../jit-source/cuda/cuda-shared-basis-tensor-templates.h   | 1 -
 include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h    | 1 -
 include/ceed/jit-source/cuda/cuda-types.h                  | 5 +----
 include/ceed/jit-source/gallery/ceed-identity.h            | 1 -
 include/ceed/jit-source/gallery/ceed-mass1dbuild.h         | 1 -
 include/ceed/jit-source/gallery/ceed-mass2dbuild.h         | 1 -
 include/ceed/jit-source/gallery/ceed-mass3dbuild.h         | 1 -
 include/ceed/jit-source/gallery/ceed-massapply.h           | 1 -
 include/ceed/jit-source/gallery/ceed-poisson1dapply.h      | 1 -
 include/ceed/jit-source/gallery/ceed-poisson1dbuild.h      | 1 -
 include/ceed/jit-source/gallery/ceed-poisson2dapply.h      | 1 -
 include/ceed/jit-source/gallery/ceed-poisson2dbuild.h      | 1 -
 include/ceed/jit-source/gallery/ceed-poisson3dapply.h      | 1 -
 include/ceed/jit-source/gallery/ceed-poisson3dbuild.h      | 1 -
 include/ceed/jit-source/gallery/ceed-scale.h               | 1 -
 include/ceed/jit-source/gallery/ceed-vectormassapply.h     | 1 -
 .../ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h    | 1 -
 .../ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h    | 1 -
 .../ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h    | 1 -
 include/ceed/jit-source/hip/hip-gen-templates.h            | 1 -
 .../jit-source/hip/hip-ref-basis-nontensor-templates.h     | 1 -
 include/ceed/jit-source/hip/hip-ref-basis-nontensor.h      | 1 -
 .../ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h   | 1 -
 include/ceed/jit-source/hip/hip-ref-basis-tensor.h         | 1 -
 .../jit-source/hip/hip-ref-operator-assemble-diagonal.h    | 1 -
 include/ceed/jit-source/hip/hip-ref-operator-assemble.h    | 1 -
 include/ceed/jit-source/hip/hip-ref-qfunction.h            | 1 -
 .../ceed/jit-source/hip/hip-ref-restriction-at-points.h    | 1 -
 .../jit-source/hip/hip-ref-restriction-curl-oriented.h     | 1 -
 include/ceed/jit-source/hip/hip-ref-restriction-offset.h   | 1 -
 include/ceed/jit-source/hip/hip-ref-restriction-oriented.h | 1 -
 include/ceed/jit-source/hip/hip-ref-restriction-strided.h  | 1 -
 .../jit-source/hip/hip-shared-basis-read-write-templates.h | 1 -
 .../jit-source/hip/hip-shared-basis-tensor-templates.h     | 1 -
 include/ceed/jit-source/hip/hip-shared-basis-tensor.h      | 1 -
 include/ceed/jit-source/hip/hip-types.h                    | 5 +----
 include/ceed/jit-source/magma/magma-basis-grad-1d.h        | 1 -
 include/ceed/jit-source/magma/magma-basis-grad-2d.h        | 1 -
 include/ceed/jit-source/magma/magma-basis-grad-3d.h        | 1 -
 include/ceed/jit-source/magma/magma-basis-interp-1d.h      | 1 -
 include/ceed/jit-source/magma/magma-basis-interp-2d.h      | 1 -
 include/ceed/jit-source/magma/magma-basis-interp-3d.h      | 1 -
 .../jit-source/magma/magma-basis-interp-deriv-nontensor.h  | 1 -
 include/ceed/jit-source/magma/magma-basis-weight-1d.h      | 1 -
 include/ceed/jit-source/magma/magma-basis-weight-2d.h      | 1 -
 include/ceed/jit-source/magma/magma-basis-weight-3d.h      | 1 -
 .../ceed/jit-source/magma/magma-basis-weight-nontensor.h   | 1 -
 include/ceed/jit-source/magma/magma-common-defs.h          | 5 +----
 include/ceed/jit-source/sycl/sycl-ref-qfunction.h          | 1 -
 .../sycl/sycl-shared-basis-read-write-templates.h          | 2 --
 .../jit-source/sycl/sycl-shared-basis-tensor-templates.h   | 1 -
 include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h    | 1 -
 include/ceed/jit-source/sycl/sycl-types.h                  | 5 +----
 include/ceed/types.h                                       | 5 +----
 71 files changed, 12 insertions(+), 93 deletions(-)

diff --git a/backends/avx/ceed-avx-tensor.c b/backends/avx/ceed-avx-tensor.c
index cd22249e83..ce8f26b355 100644
--- a/backends/avx/ceed-avx-tensor.c
+++ b/backends/avx/ceed-avx-tensor.c
@@ -10,7 +10,7 @@
 #include <immintrin.h>
 #include <stdbool.h>
 
-#ifdef CEED_F64_H
+#ifdef CEED_SCALAR_IS_FP64
 #define rtype __m256d
 #define loadu _mm256_loadu_pd
 #define storeu _mm256_storeu_pd
diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h
index e605c47a4b..d928f5158a 100644
--- a/include/ceed/ceed-f32.h
+++ b/include/ceed/ceed-f32.h
@@ -8,8 +8,9 @@
 /// @file
 /// Public header for definitions related to using FP32 floating point (single precision) for CeedScalar.
 /// Include this header in ceed.h to use float instead of double.
-#ifndef CEED_F32_H
-#define CEED_F32_H
+#pragma once
+
+#define CEED_SCALAR_IS_FP32
 
 /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.)
 #define CEED_SCALAR_TYPE CEED_SCALAR_FP32
@@ -17,5 +18,3 @@ typedef float CeedScalar;
 
 /// Machine epsilon
 #define CEED_EPSILON 6e-08
-
-#endif  // CEED_F32_H
diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h
index 3e6876cc19..bcab40cfd2 100644
--- a/include/ceed/ceed-f64.h
+++ b/include/ceed/ceed-f64.h
@@ -8,8 +8,9 @@
 /// @file
 /// Public header for definitions related to using FP64 floating point (double precision) for CeedScalar.
 /// This is the default header included in ceed.h.
-#ifndef CEED_F64_H
-#define CEED_F64_H
+#pragma once
+
+#define CEED_SCALAR_IS_FP64
 
 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.)
 #define CEED_SCALAR_TYPE CEED_SCALAR_FP64
@@ -17,5 +18,3 @@ typedef double CeedScalar;
 
 /// Machine epsilon
 #define CEED_EPSILON 1e-16
-
-#endif  // CEED_F64_H
diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
index c566147adb..6c3712c36b 100644
--- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
+++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA atomic add fallback definition
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index a7fd0f7072..eb566137ee 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA backend macro and type definitions for JiT source
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
index e4c6f1d165..9f0fa61b49 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA non-tensor product basis templates
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
index a68b2fd3d6..afee25eb8d 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA non-tensor product basis
-
 #include <ceed/types.h>
 
 #include "cuda-ref-basis-nontensor-templates.h"
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 1ce11dacfd..2d17b55b2c 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA tensor product basis with AtPoints evaluation
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index 71d8ee8c34..a5ed841a11 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA tensor product basis
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 8c35dc1ee7..1de68e76c8 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA operator full assembly
-
 #include <ceed/types.h>
 
 #if USE_CEEDSIZE
diff --git a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
index 5bde9a99f7..6b26aee037 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA backend QFunction read/write kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
index be23954097..039eab8cbf 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA offset element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
index 5a7a8c29e6..48d8bda313 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA curl-oriented element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
index f93c872ace..50c0ddbe92 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA offset element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
index a1fda87b40..dca2dbb6c7 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA oriented element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
index 59243fd30a..4d297b09c3 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA strided element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index f7e2a74db0..8671dc6423 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA shared memory basis read/write templates
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index f09677828a..ba2a273a40 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA shared memory tensor product basis templates
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 5f801cf81c..9b80043996 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA shared memory tensor product basis
-
 #include <ceed/types.h>
 
 #include "cuda-shared-basis-read-write-templates.h"
diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h
index 9863caa7e0..3410286f78 100644
--- a/include/ceed/jit-source/cuda/cuda-types.h
+++ b/include/ceed/jit-source/cuda/cuda-types.h
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA type definitions
-#ifndef CEED_CUDA_TYPES_H
-#define CEED_CUDA_TYPES_H
+#pragma once
 
 #include <ceed/types.h>
 
@@ -31,5 +30,3 @@ typedef struct {
   CeedInt     t_id;
   CeedScalar *slice;
 } SharedData_Cuda;
-
-#endif  // CEED_CUDA_TYPES_H
diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h
index 45c4ebf827..81a005d664 100644
--- a/include/ceed/jit-source/gallery/ceed-identity.h
+++ b/include/ceed/jit-source/gallery/ceed-identity.h
@@ -8,7 +8,6 @@
 /**
   @brief  Identity QFunction that copies inputs directly into outputs
 **/
-
 #include <ceed/types.h>
 
 typedef struct {
diff --git a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
index 4b2f60a135..4db3634acd 100644
--- a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 1D mass matrix
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
index 3d24858bca..583441007a 100644
--- a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 2D mass matrix
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
index 81a23aaff9..855f48682c 100644
--- a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 3D mass matrix
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-massapply.h b/include/ceed/jit-source/gallery/ceed-massapply.h
index 9d1f172190..4ec920ac7a 100644
--- a/include/ceed/jit-source/gallery/ceed-massapply.h
+++ b/include/ceed/jit-source/gallery/ceed-massapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the mass matrix
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
index 0a9dbfd9da..3d6bbfe513 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the 1D Poisson operator
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
index cad2d5c672..07096cca96 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 1D Poisson operator
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
index e140c8cc8b..5c46422ecf 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the 2D Poisson operator
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
index 91a1736748..0f4e0b3f54 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 2D Poisson operator
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
index ba25506034..c78c2ecbf4 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the geometric data for the 3D Poisson operator
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
index 640d1e91e7..b2013de28b 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 3D Poisson operator
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-scale.h b/include/ceed/jit-source/gallery/ceed-scale.h
index bdf7d7ee1c..6ffe081815 100644
--- a/include/ceed/jit-source/gallery/ceed-scale.h
+++ b/include/ceed/jit-source/gallery/ceed-scale.h
@@ -8,7 +8,6 @@
 /**
   @brief  Scaling QFunction that scales inputs
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-vectormassapply.h b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
index 1486a038b3..40825f77f2 100644
--- a/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the mass matrix on a vector system with three components
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
index e5f37c3f79..4101c7f886 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the 1D Poisson operator on a vector system with three components
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
index 3742c67dbb..061fe75355 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the 2D Poisson operator on a vector system with three components
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
index 29545099b9..7aabaa9025 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
@@ -8,7 +8,6 @@
 /**
   @brief Ceed QFunction for applying the geometric data for the 3D Poisson on a vector system with three components operator
 **/
-
 #include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index fd014dbb8a..02b4a7fd51 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP backend macro and type definitions for JiT source
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
index c8160badf1..9d840f1edd 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP non-tensor product basis templates
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
index bf950e11ab..6efbf47054 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP non-tensor product basis
-
 #include <ceed/types.h>
 
 #include "hip-ref-basis-nontensor-templates.h"
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 42e63cab65..9ce63a38de 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for CUDA tensor product basis with AtPoints evaluation
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index 8ddb3552b1..e5cf318dc1 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP tensor product basis
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
index 9e4fef865c..c9eed447e6 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP operator diagonal assembly
-
 #include <ceed/types.h>
 
 #if USE_CEEDSIZE
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index a9b5bd7bc4..38625c7c3d 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP operator full assembly
-
 #include <ceed/types.h>
 
 #if USE_CEEDSIZE
diff --git a/include/ceed/jit-source/hip/hip-ref-qfunction.h b/include/ceed/jit-source/hip/hip-ref-qfunction.h
index 6524f74dfd..f0d436572d 100644
--- a/include/ceed/jit-source/hip/hip-ref-qfunction.h
+++ b/include/ceed/jit-source/hip/hip-ref-qfunction.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP backend QFunction read/write kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
index 5d5b3f9812..cdc06d6061 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP offset element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
index 33b03ab673..12b3a0250b 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP curl-oriented element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
index a10c7fe957..3d0d68cb10 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP offset element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
index 07d13d6515..155173de63 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP oriented element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
index 2f8bd18446..8af0528756 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP strided element restriction kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index 47311c3361..8691a92710 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP shared memory basis read/write templates
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index ef2ea90ec6..4f4cc58e78 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP shared memory tensor product basis templates
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 9c33a96b51..d84f5555c8 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for HIP shared memory tensor product basis
-
 #include <ceed/types.h>
 
 #include "hip-shared-basis-read-write-templates.h"
diff --git a/include/ceed/jit-source/hip/hip-types.h b/include/ceed/jit-source/hip/hip-types.h
index 0042199c8b..418e6fb02c 100644
--- a/include/ceed/jit-source/hip/hip-types.h
+++ b/include/ceed/jit-source/hip/hip-types.h
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP type definitions
-#ifndef CEED_HIP_TYPES_H
-#define CEED_HIP_TYPES_H
+#pragma once
 
 #include <ceed/types.h>
 
@@ -31,5 +30,3 @@ typedef struct {
   CeedInt     t_id;
   CeedScalar *slice;
 } SharedData_Hip;
-
-#endif  // CEED_HIP_TYPES_H
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
index cd6f8548fb..998b0d5020 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis gradient in 1D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
index b4e7e2981a..b9fedf5c8e 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis gradient in 2D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
index c8028be756..64572a6510 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis gradient in 3D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
index 02f894ecce..c281d430dc 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis interpolation in 1D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
index 56c8081c83..fc2bba223a 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis interpolation in 1D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
index ac11e3f8df..7c214c8624 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis interpolation in 3D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
index 0614732f02..07b4386c07 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA non-tensor basis interpolation
-
 #include "magma-common-nontensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
index 431fbb6d03..8333a3cfc4 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis weight in 1D
-
 #include "magma-common-tensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
index 034992e8f1..8fa903096b 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis weight in 2D
-
 #include "magma-common-tensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
index a5ee73bd96..2405188dcc 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis weight in 3D
-
 #include "magma-common-tensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
index 6a20ecefd6..4052025c91 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA non-tensor basis weight
-
 #include "magma-common-nontensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h
index a4913c2082..5dc3550b76 100644
--- a/include/ceed/jit-source/magma/magma-common-defs.h
+++ b/include/ceed/jit-source/magma/magma-common-defs.h
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for MAGMA backend common definitions
-#ifndef CEED_MAGMA_COMMON_DEFS_H
-#define CEED_MAGMA_COMMON_DEFS_H
+#pragma once
 
 #define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[];
 
@@ -21,5 +20,3 @@
 
 // Define macro for computing the total threads in a block for use with __launch_bounds__()
 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))
-
-#endif  // CEED_MAGMA_COMMON_DEFS_H
diff --git a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
index 4a2ae56e94..24b7de724f 100644
--- a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
+++ b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for SYCL backend QFunction read/write kernels
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
index 5ddc1e84c8..06587592da 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
@@ -7,9 +7,7 @@
 
 /// @file
 /// Internal header for SYCL shared memory basis read/write templates
-
 #include <ceed/types.h>
-#include "sycl-types.h"
 
 //------------------------------------------------------------------------------
 // Helper function: load matrices for basis actions
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
index 9933b2f5f0..bd6ec34052 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for SYCL shared memory tensor product basis templates
-
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
index 60b8bf531d..fc38b00351 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for SYCL shared memory tensor product basis
-
 #include <ceed/types.h>
 
 #include "sycl-shared-basis-read-write-templates.h"
diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h
index 58938a4b2a..b42ad10385 100644
--- a/include/ceed/jit-source/sycl/sycl-types.h
+++ b/include/ceed/jit-source/sycl/sycl-types.h
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for SYCL type definitions
-#ifndef CEED_SYCL_TYPES_H
-#define CEED_SYCL_TYPES_H
+#pragma once
 
 #include <ceed/types.h>
 
@@ -35,5 +34,3 @@ typedef struct {
   CeedInt       *outputs[CEED_SYCL_NUMBER_FIELDS];
 } FieldsInt_Sycl;
 #endif
-
-#endif  // CEED_SYCL_TYPES_H
diff --git a/include/ceed/types.h b/include/ceed/types.h
index 6c6d126548..3f858a7ca4 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -7,8 +7,7 @@
 
 /// @file
 /// Public header for types and macros used in user QFunction source code
-#ifndef CEED_QFUNCTION_DEFS_H
-#define CEED_QFUNCTION_DEFS_H
+#pragma once
 
 #ifndef CEED_RUNNING_JIT_PASS
 #include <stddef.h>
@@ -253,5 +252,3 @@ typedef enum {
   /// Boolean value
   CEED_CONTEXT_FIELD_BOOL = 3,
 } CeedContextFieldType;
-
-#endif  // CEED_QFUNCTION_DEFS_H

From 4753b775a3a8f79e2dd83c2aab10890a6a04e913 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 21 Oct 2024 13:33:39 -0600
Subject: [PATCH 213/571] jit - add CeedAddJitDefine

---
 backends/cuda/ceed-cuda-compile.cpp | 28 +++++++--
 backends/hip/ceed-hip-compile.cpp   | 28 +++++++--
 doc/sphinx/source/releasenotes.md   |  2 +
 include/ceed-impl.h                 |  4 +-
 include/ceed/backend.h              |  2 +
 include/ceed/ceed.h                 |  1 +
 interface/ceed.c                    | 89 ++++++++++++++++++++++++++++-
 tests/t406-qfunction.c              |  5 +-
 tests/t406-qfunction.h              |  7 ++-
 9 files changed, 150 insertions(+), 16 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 6d3faf4cd1..c6aafbcbf6 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -38,7 +38,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   size_t                ptx_size;
   char                 *ptx;
   const int             num_opts            = 4;
-  CeedInt               num_jit_source_dirs = 0;
+  CeedInt               num_jit_source_dirs = 0, num_jit_defines = 0;
   const char          **opts;
   nvrtcProgram          prog;
   struct cudaDeviceProp prop;
@@ -85,19 +85,34 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   opts[1] = arch_arg.c_str();
   opts[2] = "-Dint32_t=int";
   opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
+  // Additional include dirs
   {
     const char **jit_source_dirs;
 
     CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
     CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
     for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
-      std::ostringstream include_dirs_arg;
+      std::ostringstream include_dir_arg;
 
-      include_dirs_arg << "-I" << jit_source_dirs[i];
-      CeedCallBackend(CeedStringAllocCopy(include_dirs_arg.str().c_str(), (char **)&opts[num_opts + i]));
+      include_dir_arg << "-I" << jit_source_dirs[i];
+      CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
     }
     CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
   }
+  // User defines
+  {
+    const char **jit_defines;
+
+    CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
+    for (CeedInt i = 0; i < num_jit_defines; i++) {
+      std::ostringstream define_arg;
+
+      define_arg << "-D" << jit_defines[i];
+      CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
+    }
+    CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
+  }
 
   // Add string source argument provided in call
   code << source;
@@ -106,11 +121,14 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
-  nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs, opts);
+  nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
   for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
     CeedCallBackend(CeedFree(&opts[num_opts + i]));
   }
+  for (CeedInt i = 0; i < num_jit_defines; i++) {
+    CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
+  }
   CeedCallBackend(CeedFree(&opts));
   if (result != NVRTC_SUCCESS) {
     char  *log;
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 582c0a2e24..20f2eb0e2a 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -37,7 +37,7 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   size_t                 ptx_size;
   char                  *ptx;
   const int              num_opts            = 4;
-  CeedInt                num_jit_source_dirs = 0;
+  CeedInt                num_jit_source_dirs = 0, num_jit_defines = 0;
   const char           **opts;
   int                    runtime_version;
   hiprtcProgram          prog;
@@ -87,19 +87,34 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   opts[1]              = arch_arg.c_str();
   opts[2]              = "-munsafe-fp-atomics";
   opts[3]              = "-DCEED_RUNNING_JIT_PASS=1";
+  // Additional include dirs
   {
     const char **jit_source_dirs;
 
     CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
     CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
     for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
-      std::ostringstream include_dirs_arg;
+      std::ostringstream include_dir_arg;
 
-      include_dirs_arg << "-I" << jit_source_dirs[i];
-      CeedCallBackend(CeedStringAllocCopy(include_dirs_arg.str().c_str(), (char **)&opts[num_opts + i]));
+      include_dir_arg << "-I" << jit_source_dirs[i];
+      CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
     }
     CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
   }
+  // User defines
+  {
+    const char **jit_defines;
+
+    CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
+    for (CeedInt i = 0; i < num_jit_defines; i++) {
+      std::ostringstream define_arg;
+
+      define_arg << "-D" << jit_defines[i];
+      CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
+    }
+    CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
+  }
 
   // Add string source argument provided in call
   code << source;
@@ -108,11 +123,14 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
-  hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs, opts);
+  hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
   for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
     CeedCallBackend(CeedFree(&opts[num_opts + i]));
   }
+  for (CeedInt i = 0; i < num_jit_defines; i++) {
+    CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
+  }
   CeedCallBackend(CeedFree(&opts));
   if (result != HIPRTC_SUCCESS) {
     size_t log_size;
diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 764e2949ea..4115d62647 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -19,6 +19,8 @@ On this page we provide a summary of the main API changes, new features and exam
 - Add `CeedElemRestrictionGetLLayout` to provide L-vector layout for strided `CeedElemRestriction` created with `CEED_BACKEND_STRIDES`.
 - Add `CeedVectorReturnCeed` and similar when parent `Ceed` context for a libCEED object is only needed once in a calling scope.
 - Enable `#pragma once` for all JiT source; remove duplicate includes in JiT source string before compilation.
+- Allow user to set additional compiler options for CUDA and HIP JiT.
+Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`.
 
 ### Examples
 
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 902dbbe35c..0758342523 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -99,7 +99,9 @@ struct Ceed_private {
   Ceed         op_fallback_ceed, op_fallback_parent;
   const char  *op_fallback_resource;
   char       **jit_source_roots;
-  CeedInt      num_jit_source_roots;
+  CeedInt      num_jit_source_roots, max_jit_source_roots;
+  char       **jit_defines;
+  CeedInt      num_jit_defines, max_jit_defines;
   int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *);
   int (*SetStream)(Ceed, void *);
   int (*GetPreferredMemType)(CeedMemType *);
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 43f2d52d20..e27d97cab3 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -256,6 +256,8 @@ CEED_EXTERN int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec);
 CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec);
 CEED_EXTERN int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots);
 CEED_EXTERN int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots);
+CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines);
+CEED_EXTERN int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines);
 
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
 CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 1660d724a9..b531bd8d28 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -107,6 +107,7 @@ CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy);
 CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource);
 CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
 CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
+CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
 CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
 CEED_EXTERN int CeedDestroy(Ceed *ceed);
 CEED_EXTERN int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, int ecode, const char *format, ...);
diff --git a/interface/ceed.c b/interface/ceed.c
index e214b6eb85..37428a3338 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -668,6 +668,16 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
       }
       CeedCall(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
     }
+    {
+      const char **jit_defines;
+      CeedInt      num_jit_defines = 0;
+
+      CeedCall(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
+      for (CeedInt i = 0; i < num_jit_defines; i++) {
+        CeedCall(CeedAddJitSourceRoot(fallback_ceed, jit_defines[i]));
+      }
+      CeedCall(CeedRestoreJitDefines(ceed, &jit_defines));
+    }
   }
   *fallback_ceed = ceed->op_fallback_ceed;
   return CEED_ERROR_SUCCESS;
@@ -874,7 +884,7 @@ int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) {
 }
 
 /**
-  @brief Retrieve list ofadditional JiT source roots from `Ceed` context.
+  @brief Retrieve list of additional JiT source roots from `Ceed` context.
 
   Note: The caller is responsible for restoring `jit_source_roots` with @ref CeedRestoreJitSourceRoots().
 
@@ -910,6 +920,43 @@ int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Retrieve list of additional JiT defines from `Ceed` context.
+
+  Note: The caller is responsible for restoring `jit_defines` with @ref CeedRestoreJitDefines().
+
+  @param[in]  ceed            `Ceed` context
+  @param[out] num_jit_defines Number of JiT defines
+  @param[out] jit_defines     Strings such as `foo=bar`, used as `-Dfoo=bar` in JiT
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *num_defines = ceed_parent->num_jit_defines;
+  *jit_defines = (const char **)ceed_parent->jit_defines;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Restore list of additional JiT defines from with @ref CeedGetJitDefines()
+
+  @param[in]  ceed        `Ceed` context
+  @param[out] jit_defines String such as `foo=bar`, used as `-Dfoo=bar` in JiT
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines) {
+  *jit_defines = NULL;
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -1294,13 +1341,46 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) {
   CeedInt index       = ceed_parent->num_jit_source_roots;
   size_t  path_length = strlen(jit_source_root);
 
-  CeedCall(CeedRealloc(index + 1, &ceed_parent->jit_source_roots));
+  if (ceed_parent->num_jit_source_roots == ceed_parent->max_jit_source_roots) {
+    if (ceed_parent->max_jit_source_roots == 0) ceed_parent->max_jit_source_roots = 1;
+    ceed_parent->max_jit_source_roots *= 2;
+    CeedCall(CeedRealloc(ceed_parent->max_jit_source_roots, &ceed_parent->jit_source_roots));
+  }
   CeedCall(CeedCalloc(path_length + 1, &ceed_parent->jit_source_roots[index]));
   memcpy(ceed_parent->jit_source_roots[index], jit_source_root, path_length);
   ceed_parent->num_jit_source_roots++;
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set additional JiT compiler define for `Ceed` context
+
+  @param[in,out] ceed       `Ceed` context
+  @param[in]     jit_define String such as `foo=bar`, used as `-Dfoo=bar` in JiT
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+
+  CeedInt index         = ceed_parent->num_jit_defines;
+  size_t  define_length = strlen(jit_define);
+
+  if (ceed_parent->num_jit_defines == ceed_parent->max_jit_defines) {
+    if (ceed_parent->max_jit_defines == 0) ceed_parent->max_jit_defines = 1;
+    ceed_parent->max_jit_defines *= 2;
+    CeedCall(CeedRealloc(ceed_parent->max_jit_defines, &ceed_parent->jit_defines));
+  }
+  CeedCall(CeedCalloc(define_length + 1, &ceed_parent->jit_defines[index]));
+  memcpy(ceed_parent->jit_defines[index], jit_define, define_length);
+  ceed_parent->num_jit_defines++;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `Ceed`
 
@@ -1355,6 +1435,11 @@ int CeedDestroy(Ceed *ceed) {
   }
   CeedCall(CeedFree(&(*ceed)->jit_source_roots));
 
+  for (CeedInt i = 0; i < (*ceed)->num_jit_defines; i++) {
+    CeedCall(CeedFree(&(*ceed)->jit_defines[i]));
+  }
+  CeedCall(CeedFree(&(*ceed)->jit_defines));
+
   CeedCall(CeedFree(&(*ceed)->f_offsets));
   CeedCall(CeedFree(&(*ceed)->resource));
   CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));
diff --git a/tests/t406-qfunction.c b/tests/t406-qfunction.c
index e558139146..d593f8f73d 100644
--- a/tests/t406-qfunction.c
+++ b/tests/t406-qfunction.c
@@ -24,6 +24,7 @@ int main(int argc, char **argv) {
 
     memcpy(&file_path[last_slash - file_path], "/test-include/", 15);
     CeedAddJitSourceRoot(ceed, file_path);
+    CeedAddJitDefine(ceed, "COMPILER_DEFINED_SCALE=42");
   }
 
   CeedVectorCreate(ceed, q, &w);
@@ -71,9 +72,9 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     for (CeedInt i = 0; i < q; i++) {
-      if (fabs(5 * v_true[i] * sqrt(2.) - v_array[i]) > 1E3 * CEED_EPSILON) {
+      if (fabs(5 * COMPILER_DEFINED_SCALE * v_true[i] * sqrt(2.) - v_array[i]) > 5E3 * CEED_EPSILON) {
         // LCOV_EXCL_START
-        printf("[%" CeedInt_FMT "] v_true %f != v %f\n", i, 5 * v_true[i] * sqrt(2.), v_array[i]);
+        printf("[%" CeedInt_FMT "] v_true %f != v %f\n", i, 5 * COMPILER_DEFINED_SCALE * v_true[i] * sqrt(2.), v_array[i]);
         // LCOV_EXCL_STOP
       }
     }
diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h
index 642de84a67..75a4229541 100644
--- a/tests/t406-qfunction.h
+++ b/tests/t406-qfunction.h
@@ -23,6 +23,11 @@
 #  include "t406-qfunction-scales.h"
 // clang-format on
 
+// Extra define set via CeedAddJitDefine() during JiT
+#ifndef CEED_RUNNING_JIT_PASS
+#define COMPILER_DEFINED_SCALE 42
+#endif
+
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
   CeedScalar       *q_data = out[0];
@@ -36,7 +41,7 @@ CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, Ce
   const CeedScalar *q_data = in[0], *u = in[1];
   CeedScalar       *v = out[0];
   for (CeedInt i = 0; i < Q; i++) {
-    v[i] = q_data[i] * (times_two(u[i]) + times_three(u[i])) * sqrt(1.0 * SCALE_TWO);
+    v[i] = q_data[i] * COMPILER_DEFINED_SCALE * (times_two(u[i]) + times_three(u[i])) * sqrt(1.0 * SCALE_TWO);
   }
   return 0;
 }

From aeb3a72da5687f4578b9985188eaa13acb56dd0f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 22 Oct 2024 09:52:29 -0600
Subject: [PATCH 214/571] jit - count readers for user jit -i/-d

---
 include/ceed-impl.h |  4 ++--
 interface/ceed.c    | 29 +++++++++++++++++++++++------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 0758342523..7af9ecb8c9 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -99,9 +99,9 @@ struct Ceed_private {
   Ceed         op_fallback_ceed, op_fallback_parent;
   const char  *op_fallback_resource;
   char       **jit_source_roots;
-  CeedInt      num_jit_source_roots, max_jit_source_roots;
+  CeedInt      num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers;
   char       **jit_defines;
-  CeedInt      num_jit_defines, max_jit_defines;
+  CeedInt      num_jit_defines, max_jit_defines, num_jit_defines_readers;
   int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *);
   int (*SetStream)(Ceed, void *);
   int (*GetPreferredMemType)(CeedMemType *);
diff --git a/interface/ceed.c b/interface/ceed.c
index 37428a3338..bc0100da5e 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -659,14 +659,14 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
     fallback_ceed->Error              = ceed->Error;
     ceed->op_fallback_ceed            = fallback_ceed;
     {
-      const char **jit_source_dirs;
-      CeedInt      num_jit_source_dirs = 0;
+      const char **jit_source_roots;
+      CeedInt      num_jit_source_roots = 0;
 
-      CeedCall(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
-      for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
-        CeedCall(CeedAddJitSourceRoot(fallback_ceed, jit_source_dirs[i]));
+      CeedCall(CeedGetJitSourceRoots(ceed, &num_jit_source_roots, &jit_source_roots));
+      for (CeedInt i = 0; i < num_jit_source_roots; i++) {
+        CeedCall(CeedAddJitSourceRoot(fallback_ceed, jit_source_roots[i]));
       }
-      CeedCall(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+      CeedCall(CeedRestoreJitSourceRoots(ceed, &jit_source_roots));
     }
     {
       const char **jit_defines;
@@ -902,6 +902,7 @@ int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***ji
   CeedCall(CeedGetParent(ceed, &ceed_parent));
   *num_source_roots = ceed_parent->num_jit_source_roots;
   *jit_source_roots = (const char **)ceed_parent->jit_source_roots;
+  ceed_parent->num_jit_source_roots_readers++;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -916,7 +917,11 @@ int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***ji
   @ref Backend
 **/
 int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
   *jit_source_roots = NULL;
+  ceed_parent->num_jit_source_roots_readers--;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -939,6 +944,7 @@ int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines
   CeedCall(CeedGetParent(ceed, &ceed_parent));
   *num_defines = ceed_parent->num_jit_defines;
   *jit_defines = (const char **)ceed_parent->jit_defines;
+  ceed_parent->num_jit_defines_readers++;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -953,7 +959,11 @@ int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines
   @ref Backend
 **/
 int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
   *jit_defines = NULL;
+  ceed_parent->num_jit_defines_readers--;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1337,6 +1347,7 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) {
   Ceed ceed_parent;
 
   CeedCall(CeedGetParent(ceed, &ceed_parent));
+  CeedCheck(!ceed_parent->num_jit_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT source root, read access has been granted");
 
   CeedInt index       = ceed_parent->num_jit_source_roots;
   size_t  path_length = strlen(jit_source_root);
@@ -1366,6 +1377,7 @@ int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
   Ceed ceed_parent;
 
   CeedCall(CeedGetParent(ceed, &ceed_parent));
+  CeedCheck(!ceed_parent->num_jit_defines_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT define, read access has been granted");
 
   CeedInt index         = ceed_parent->num_jit_defines;
   size_t  define_length = strlen(jit_define);
@@ -1418,6 +1430,11 @@ int CeedDestroy(Ceed *ceed) {
     *ceed = NULL;
     return CEED_ERROR_SUCCESS;
   }
+
+  CeedCheck(!(*ceed)->num_jit_source_roots_readers, *ceed, CEED_ERROR_ACCESS,
+            "Cannot destroy ceed context, read access for JiT source roots has been granted");
+  CeedCheck(!(*ceed)->num_jit_defines_readers, *ceed, CEED_ERROR_ACCESS, "Cannot add JiT source root, read access for JiT defines has been granted");
+
   if ((*ceed)->delegate) CeedCall(CeedDestroy(&(*ceed)->delegate));
 
   if ((*ceed)->obj_delegate_count > 0) {

From 830fc37b4962fb06dfb1a529a988003b7e3581de Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 22 Oct 2024 10:20:10 -0600
Subject: [PATCH 215/571] minor - update error message co-authored-by: jrwrigh
 <james@jameswright.xyz>

---
 interface/ceed.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/ceed.c b/interface/ceed.c
index bc0100da5e..5579cca242 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1347,7 +1347,7 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) {
   Ceed ceed_parent;
 
   CeedCall(CeedGetParent(ceed, &ceed_parent));
-  CeedCheck(!ceed_parent->num_jit_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT source root, read access has been granted");
+  CeedCheck(!ceed_parent->num_jit_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT source root, read access has not been restored");
 
   CeedInt index       = ceed_parent->num_jit_source_roots;
   size_t  path_length = strlen(jit_source_root);
@@ -1377,7 +1377,7 @@ int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
   Ceed ceed_parent;
 
   CeedCall(CeedGetParent(ceed, &ceed_parent));
-  CeedCheck(!ceed_parent->num_jit_defines_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT define, read access has been granted");
+  CeedCheck(!ceed_parent->num_jit_defines_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT define, read access has not been restored");
 
   CeedInt index         = ceed_parent->num_jit_defines;
   size_t  define_length = strlen(jit_define);

From 26ef7cdab87216015dfdd84a70b0ca05f3e531f1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 22 Oct 2024 11:27:04 -0600
Subject: [PATCH 216/571] debug - truncate jit output

---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 3 ---
 backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp | 4 ----
 backends/cuda/ceed-cuda-compile.cpp                | 6 +++---
 backends/hip-gen/ceed-hip-gen-operator-build.cpp   | 4 ----
 backends/hip-ref/ceed-hip-ref-qfunction-load.cpp   | 4 ----
 backends/hip-ref/ceed-hip-ref-restriction.c        | 1 -
 backends/hip/ceed-hip-compile.cpp                  | 6 +++---
 interface/ceed-jit-tools.c                         | 2 --
 8 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 315db3844f..3019dde777 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -896,9 +896,6 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "// -----------------------------------------------------------------------------\n\n";
 
   // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated Operator Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
-
   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
index 2d5540ead8..0ef3dde204 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
@@ -105,10 +105,6 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   code << "  }\n";
   code << "}\n";
 
-  // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated QFunction Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
-
   // Compile kernel
   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 0));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, kernel_name.c_str(), &data->QFunction));
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index c6aafbcbf6..20c57db2e8 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -121,6 +121,9 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
+  CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JIT SOURCE ----------\n");
   nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
   for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
@@ -134,9 +137,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
     char  *log;
     size_t log_size;
 
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
-    CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
     CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
     CeedCallBackend(CeedMalloc(log_size, &log));
     CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index f1a876ce26..05b4e27c24 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -896,10 +896,6 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   code << "}\n";
   code << "// -----------------------------------------------------------------------------\n\n";
 
-  // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated Operator Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
-
   CeedInt block_sizes[3] = {0, 0, 0};
   CeedInt num_elem;
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
index 2311f8a332..0e58932439 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
@@ -109,10 +109,6 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   code << "  }\n";
   code << "}\n";
 
-  // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated QFunction Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
-
   // Compile kernel
   CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 1, "BLOCK_SIZE", ceed_Hip->opt_block_size));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, kernel_name.c_str(), &data->QFunction));
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index ca1d19d7a6..7642e09376 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -71,7 +71,6 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
           "// AtPoints restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-at-points.h>\n\n"
           "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
 
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 20f2eb0e2a..dface44ef6 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -123,6 +123,9 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
+  CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JIT SOURCE ----------\n");
   hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
   for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
@@ -136,9 +139,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
     size_t log_size;
     char  *log;
 
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
-    CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
     CeedChk_hiprtc(ceed, hiprtcGetProgramLogSize(prog, &log_size));
     CeedCallBackend(CeedMalloc(log_size, &log));
     CeedCallHiprtc(ceed, hiprtcGetProgramLog(prog, log));
diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index d89f713dfe..14f1babb87 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -130,8 +130,6 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed JiT ----------\n");
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Current source file: ");
   CeedDebug(ceed, "%s\n", source_file_path);
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Current buffer:\n");
-  CeedDebug(ceed, "%s\n", *buffer);
 
   // Read file to temporary buffer
   source_file = fopen(source_file_path, "rb");

From f6c445a1b494684a6a4b8252d1a7c0d095618ba4 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 22 Oct 2024 22:18:03 -0400
Subject: [PATCH 217/571] doc: Update release notes based on #1696 (#1701)

* doc: Update release notes based on #1696

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>
---
 doc/sphinx/source/releasenotes.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 4115d62647..4e629b48e0 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -12,6 +12,9 @@ On this page we provide a summary of the main API changes, new features and exam
 - `CEED_BASIS_COLLOCATED` removed; users should only use `CEED_BASIS_NONE`.
 - Remove unneeded pointer for `CeedElemRestrictionGetELayout`.
 - Require use of `Ceed*Destroy()` on Ceed objects returned from `CeedOperatorFieldGet*()`;
+- Change QFunction source include file handling in JiT compilers
+    - Add `CEED_RUNNING_JIT_PASS` compiler definition for wrapping header files that device JiT compilers cannot read
+    - Users should now prefer `#include <ceed/types.h>` rather than `#include <ceed.h>` in QFunction source files
 
 ### New features
 

From 9bc663991d6482bcb1d60b1f116148f11db83fa1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 22 Oct 2024 16:55:55 -0600
Subject: [PATCH 218/571] ceed - require *GetCeed ceed to be Destroyed

---
 backends/avx/ceed-avx-blocked.c               |   1 +
 backends/avx/ceed-avx-serial.c                |   1 +
 backends/blocked/ceed-blocked-operator.c      |   9 +-
 backends/blocked/ceed-blocked.c               |   1 +
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |   2 +-
 backends/cuda-gen/ceed-cuda-gen-operator.c    |   3 +
 backends/cuda-gen/ceed-cuda-gen-qfunction.c   |   2 +-
 backends/cuda-gen/ceed-cuda-gen.c             |   1 +
 backends/cuda-ref/ceed-cuda-ref-basis.c       |  12 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c    |  35 ++--
 .../cuda-ref/ceed-cuda-ref-qfunction-load.cpp |   7 +-
 backends/cuda-ref/ceed-cuda-ref-qfunction.c   |   3 +-
 .../cuda-ref/ceed-cuda-ref-qfunctioncontext.c |   4 +
 backends/cuda-ref/ceed-cuda-ref-restriction.c |   6 +
 backends/cuda-ref/ceed-cuda-ref-vector.c      |  17 +-
 backends/cuda-shared/ceed-cuda-shared-basis.c |   7 +-
 backends/cuda-shared/ceed-cuda-shared.c       |   1 +
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |   2 +-
 backends/hip-gen/ceed-hip-gen-operator.c      |   3 +
 backends/hip-gen/ceed-hip-gen-qfunction.c     |   2 +-
 backends/hip-gen/ceed-hip-gen.c               |   1 +
 backends/hip-ref/ceed-hip-ref-basis.c         |  12 +-
 backends/hip-ref/ceed-hip-ref-operator.c      |  35 ++--
 .../hip-ref/ceed-hip-ref-qfunction-load.cpp   |   9 +-
 backends/hip-ref/ceed-hip-ref-qfunction.c     |   3 +-
 .../hip-ref/ceed-hip-ref-qfunctioncontext.c   |   4 +
 backends/hip-ref/ceed-hip-ref-restriction.c   |   6 +
 backends/hip-ref/ceed-hip-ref-vector.c        |  17 +-
 backends/hip-shared/ceed-hip-shared-basis.c   |   5 +-
 backends/hip-shared/ceed-hip-shared.c         |   1 +
 backends/magma/ceed-magma-basis.c             |  13 ++
 backends/magma/ceed-magma-det.c               |   1 +
 backends/magma/ceed-magma.c                   |   1 +
 backends/memcheck/ceed-memcheck-blocked.c     |   1 +
 backends/memcheck/ceed-memcheck-qfunction.c   |   5 +-
 .../memcheck/ceed-memcheck-qfunctioncontext.c |  10 +-
 backends/memcheck/ceed-memcheck-restriction.c |   1 +
 backends/memcheck/ceed-memcheck-serial.c      |   1 +
 backends/memcheck/ceed-memcheck-vector.c      |  15 +-
 backends/opt/ceed-opt-blocked.c               |   1 +
 backends/opt/ceed-opt-operator.c              |   9 +-
 backends/opt/ceed-opt-serial.c                |   1 +
 backends/ref/ceed-ref-basis.c                 |  22 ++-
 backends/ref/ceed-ref-operator.c              |  25 ++-
 backends/ref/ceed-ref-qfunction.c             |   1 +
 backends/ref/ceed-ref-qfunctioncontext.c      |   1 +
 backends/ref/ceed-ref-restriction.c           |  12 +-
 backends/ref/ceed-ref-tensor.c                |   1 +
 backends/ref/ceed-ref-vector.c                |   1 +
 .../ceed-sycl-gen-operator-build.sycl.cpp     |   2 +-
 .../sycl-gen/ceed-sycl-gen-operator.sycl.cpp  |   3 +
 .../sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp |   2 +
 backends/sycl-gen/ceed-sycl-gen.sycl.cpp      |   2 +
 .../sycl-ref/ceed-sycl-ref-basis.sycl.cpp     |  21 +-
 .../sycl-ref/ceed-sycl-ref-operator.sycl.cpp  |  40 ++--
 .../ceed-sycl-ref-qfunction-load.sycl.cpp     |   3 +-
 .../sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp |   3 +
 .../ceed-sycl-ref-qfunctioncontext.sycl.cpp   |  15 +-
 .../sycl-ref/ceed-sycl-restriction.sycl.cpp   |   6 +-
 backends/sycl-ref/ceed-sycl-vector.sycl.cpp   |  32 ++-
 .../ceed-sycl-shared-basis.sycl.cpp           |   4 +-
 backends/sycl/ceed-sycl-common.sycl.cpp       |   4 +-
 backends/xsmm/ceed-xsmm-blocked.c             |   1 +
 backends/xsmm/ceed-xsmm-serial.c              |   1 +
 backends/xsmm/ceed-xsmm-tensor.c              |   8 +-
 examples/fluids/include/petsc-ceed-utils.h    |   1 +
 examples/fluids/src/mat-ceed.c                |   1 -
 examples/fluids/src/petsc_ops.c               |   1 +
 examples/fluids/src/turb_spanstats.c          |   2 +
 interface/ceed-basis.c                        |  53 ++---
 interface/ceed-cuda.c                         |   5 +-
 interface/ceed-elemrestriction.c              |  64 +++---
 interface/ceed-hip.c                          |   5 +-
 interface/ceed-operator.c                     | 127 ++++++------
 interface/ceed-preconditioning.c              | 100 +++++-----
 interface/ceed-qfunction.c                    |  37 ++--
 interface/ceed-qfunctioncontext.c             |  50 +++--
 interface/ceed-tensor.c                       |   4 +-
 interface/ceed-vector.c                       | 183 ++++++++++--------
 interface/ceed.c                              |  24 ++-
 tests/t409-qfunction.c                        |   1 +
 81 files changed, 657 insertions(+), 482 deletions(-)

diff --git a/backends/avx/ceed-avx-blocked.c b/backends/avx/ceed-avx-blocked.c
index c565faa653..bf898f571a 100644
--- a/backends/avx/ceed-avx-blocked.c
+++ b/backends/avx/ceed-avx-blocked.c
@@ -25,6 +25,7 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/avx/ceed-avx-serial.c b/backends/avx/ceed-avx-serial.c
index 5ebe28e19a..5d33900758 100644
--- a/backends/avx/ceed-avx-serial.c
+++ b/backends/avx/ceed-avx-serial.c
@@ -25,6 +25,7 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index c9f5ccfd46..2f7b0e6dad 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -30,7 +30,8 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -105,6 +106,7 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
           // Empty case - won't occur
           break;
       }
+      CeedCallBackend(CeedDestroy(&ceed_rstr));
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e]));
     }
@@ -190,6 +192,7 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -198,7 +201,6 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Blocked(CeedOperator op) {
   bool                  is_setup_done;
-  Ceed                  ceed;
   CeedInt               Q, num_input_fields, num_output_fields;
   const CeedInt         block_size = 8;
   CeedQFunctionField   *qf_input_fields, *qf_output_fields;
@@ -209,7 +211,6 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -707,6 +708,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, true, e_data_full, impl));
 
   // Output blocked restriction
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorRestoreArray(l_vec, &l_vec_array));
   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
   CeedCallBackend(CeedElemRestrictionApply(block_rstr, CEED_TRANSPOSE, l_vec, *assembled, request));
@@ -783,6 +785,7 @@ int CeedOperatorCreate_Blocked(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Blocked));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Blocked));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Blocked));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/blocked/ceed-blocked.c b/backends/blocked/ceed-blocked.c
index f37338f0d6..ca55c01e45 100644
--- a/backends/blocked/ceed-blocked.c
+++ b/backends/blocked/ceed-blocked.c
@@ -25,6 +25,7 @@ static int CeedInit_Blocked(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Blocked));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 315db3844f..3103bcbfb7 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -901,8 +901,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
-
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 4c235984fd..7ea1002b21 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -122,6 +122,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
       CeedOperator op_fallback;
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases");
+      CeedCallBackend(CeedDestroy(&ceed));
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
@@ -251,6 +252,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -266,6 +268,7 @@ int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetData(op, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
index aec5294a8d..483b520503 100644
--- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c
+++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
@@ -42,11 +42,11 @@ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) {
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedQFunctionSetData(qf, data));
 
-  // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index fd4fcef722..0ab817186d 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -33,6 +33,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
+  CeedCallBackend(CeedDestroy(&ceed_shared));
 
   CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 3eca8134c3..d7ab9a4aae 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -88,6 +88,7 @@ static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedIn
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -116,7 +117,6 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   CeedScalar       *d_v;
   CeedBasis_Cuda   *data;
 
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
@@ -127,6 +127,8 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
   // Check padded to uniform number of points per elem
   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
   {
@@ -244,6 +246,7 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -351,6 +354,7 @@ static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, con
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -384,6 +388,7 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -403,6 +408,7 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
   CeedCallCuda(ceed, cudaFree(data->d_div));
   CeedCallCuda(ceed, cudaFree(data->d_curl));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -449,6 +455,7 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -503,6 +510,7 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -557,6 +565,7 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -611,6 +620,7 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 8cb8855ba7..ac7b1cad03 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -78,10 +78,11 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
     CeedCallCuda(ceed, cudaFree(impl->diag->d_div_out));
     CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_in));
     CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_out));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
+    CeedCallBackend(CeedDestroy(&ceed));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
   }
   CeedCallBackend(CeedFree(&impl->diag));
 
@@ -92,6 +93,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
     CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module));
     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in));
     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out));
+    CeedCallBackend(CeedDestroy(&ceed));
   }
   CeedCallBackend(CeedFree(&impl->asmb));
 
@@ -227,6 +229,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -234,7 +237,6 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Cuda(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -245,7 +247,6 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -587,11 +588,7 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) {
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
-      continue;
-    }
-    {
+    if (!impl->skip_rstr_out[field]) {
       CeedElemRestriction elem_rstr;
 
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
@@ -603,6 +600,7 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Return work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -610,7 +608,6 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -621,7 +618,6 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -918,11 +914,7 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) {
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
-      continue;
-    }
-    {
+    if (!impl->skip_rstr_out[field]) {
       CeedElemRestriction elem_rstr;
 
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
@@ -934,6 +926,7 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
   // Restore work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1075,6 +1068,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   }
 
   // Restore output
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
   return CEED_ERROR_SUCCESS;
 }
@@ -1276,6 +1271,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
   CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, cudaMemcpyHostToDevice));
   CeedCallBackend(CeedFree(&eval_modes_in));
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
@@ -1361,6 +1357,7 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
                                       num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
                                       use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
   CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
@@ -1449,6 +1446,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec
   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
   return CEED_ERROR_SUCCESS;
 }
@@ -1661,6 +1659,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
     CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
@@ -1769,6 +1768,7 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   return CEED_ERROR_SUCCESS;
@@ -2040,6 +2040,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   // Restore work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2062,6 +2063,7 @@ int CeedOperatorCreate_Cuda(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2080,6 +2082,7 @@ int CeedOperatorCreateAtPoints_Cuda(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
index 2d5540ead8..4eaa18d1c2 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
@@ -28,12 +28,12 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   CeedQFunctionField *input_fields, *output_fields;
   CeedQFunction_Cuda *data;
 
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
-  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
-
   // QFunction is built
+  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
   if (data->QFunction) return CEED_ERROR_SUCCESS;
 
+  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
+
   // QFunction kernel generation
   CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
 
@@ -112,6 +112,7 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   // Compile kernel
   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 0));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, kernel_name.c_str(), &data->QFunction));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
index eec4aea26c..32a02b43dd 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
@@ -58,6 +58,7 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, CeedVector *U, C
 
   // Restore context
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -95,13 +96,13 @@ int CeedQFunctionCreate_Cuda(CeedQFunction qf) {
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedQFunctionSetData(qf, data));
 
-  // Read QFunction name
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "SetCUDAUserFunction", CeedQFunctionSetCUDAUserFunction_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
index 4257265987..5afbb7aa02 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
@@ -37,6 +37,7 @@ static inline int CeedQFunctionContextSyncH2D_Cuda(const CeedQFunctionContext ct
     impl->d_data = impl->d_data_owned;
   }
   CeedCallCuda(ceed, cudaMemcpy(impl->d_data, impl->h_data, ctx_size, cudaMemcpyHostToDevice));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -64,6 +65,7 @@ static inline int CeedQFunctionContextSyncD2H_Cuda(const CeedQFunctionContext ct
     impl->h_data = impl->h_data_owned;
   }
   CeedCallCuda(ceed, cudaMemcpy(impl->h_data, impl->d_data, ctx_size, cudaMemcpyDeviceToHost));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -205,6 +207,7 @@ static int CeedQFunctionContextSetDataDevice_Cuda(const CeedQFunctionContext ctx
       impl->d_data          = data;
       break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -335,6 +338,7 @@ int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index c4a5c22dda..3ec1a4ef90 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -106,6 +106,7 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
     } break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -285,6 +286,7 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -404,6 +406,7 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) {
   CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned));
   CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_points_per_elem_owned));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -485,6 +488,7 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const
   CeedCallBackend(CeedFree(&l_vec_indices));
   CeedCallBackend(CeedFree(&t_offsets));
   CeedCallBackend(CeedFree(&t_indices));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -502,6 +506,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
   CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
@@ -649,6 +654,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", CeedElemRestrictionGetAtPointsElementOffset_Cuda));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 9deb6dec82..7365327229 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -41,10 +41,8 @@ static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_
 static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
   CeedSize         length;
   size_t           bytes;
-  Ceed             ceed;
   CeedVector_Cuda *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
   CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
@@ -56,10 +54,10 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
   } else if (impl->d_array_owned) {
     impl->d_array = impl->d_array_owned;
   } else {
-    CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes));
+    CeedCallCuda(CeedVectorReturnCeed(vec), cudaMalloc((void **)&impl->d_array_owned, bytes));
     impl->d_array = impl->d_array_owned;
   }
-  CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
+  CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -68,13 +66,11 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
 //------------------------------------------------------------------------------
 static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
   CeedSize         length;
-  Ceed             ceed;
   CeedVector_Cuda *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
-  CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
+  CeedCheck(impl->d_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid device data to sync to host");
 
   if (impl->h_array_borrowed) {
     impl->h_array = impl->h_array_borrowed;
@@ -91,7 +87,7 @@ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   size_t bytes = length * sizeof(CeedScalar);
 
-  CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
+  CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -202,6 +198,7 @@ static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMod
 
   CeedCallBackend(CeedSetDeviceCeedScalarArray_Cuda(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned,
                                                     (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -611,6 +608,7 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
     }
   }
   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -689,11 +687,9 @@ int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_arr
 // Compute y = alpha x + y
 //------------------------------------------------------------------------------
 static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
-  Ceed             ceed;
   CeedSize         length;
   CeedVector_Cuda *y_impl, *x_impl;
 
-  CeedCallBackend(CeedVectorGetCeed(y, &ceed));
   CeedCallBackend(CeedVectorGetData(y, &y_impl));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetLength(y, &length));
@@ -824,6 +820,7 @@ int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index b1709787ec..bd2467e538 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -189,6 +189,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -217,7 +218,6 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedScalar            *d_v;
   CeedBasis_Cuda_shared *data;
 
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
@@ -228,6 +228,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
   // Check padded to uniform number of points per elem
   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
   {
@@ -345,6 +347,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -379,6 +382,7 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
   CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -443,6 +447,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c
index 5ab65815cc..baa374af36 100644
--- a/backends/cuda-shared/ceed-cuda-shared.c
+++ b/backends/cuda-shared/ceed-cuda-shared.c
@@ -33,6 +33,7 @@ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index f1a876ce26..6a13cd5600 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -908,8 +908,8 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
                                   block_sizes[0] * block_sizes[1] * block_sizes[2]));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
-
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index fcd58ed76d..a61d1df32f 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -58,6 +58,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
       CeedOperator op_fallback;
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to non-tensor bases");
+      CeedCallBackend(CeedDestroy(&ceed));
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
@@ -177,6 +178,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -192,6 +194,7 @@ int CeedOperatorCreate_Hip_gen(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetData(op, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c
index 32d5653b98..6da2c1d10e 100644
--- a/backends/hip-gen/ceed-hip-gen-qfunction.c
+++ b/backends/hip-gen/ceed-hip-gen-qfunction.c
@@ -42,11 +42,11 @@ int CeedQFunctionCreate_Hip_gen(CeedQFunction qf) {
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedQFunctionSetData(qf, data));
 
-  // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index d66ceb041a..e867b4eb9e 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -33,6 +33,7 @@ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedInit("/gpu/hip/shared", &ceed_shared));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
+  CeedCallBackend(CeedDestroy(&ceed_shared));
 
   CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
 
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 7e7f0e97e4..8fb3d3fa20 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -87,6 +87,7 @@ static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -114,7 +115,6 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   CeedScalar       *d_v;
   CeedBasis_Hip    *data;
 
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
@@ -125,6 +125,8 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
   // Check padded to uniform number of points per elem
   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
   {
@@ -242,6 +244,7 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -346,6 +349,7 @@ static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, cons
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -379,6 +383,7 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -398,6 +403,7 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
   CeedCallHip(ceed, hipFree(data->d_div));
   CeedCallHip(ceed, hipFree(data->d_curl));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -444,6 +450,7 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -498,6 +505,7 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -552,6 +560,7 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -606,6 +615,7 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index c2e58a1e7f..04733482ee 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -77,10 +77,11 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
     CeedCallHip(ceed, hipFree(impl->diag->d_div_out));
     CeedCallHip(ceed, hipFree(impl->diag->d_curl_in));
     CeedCallHip(ceed, hipFree(impl->diag->d_curl_out));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
+    CeedCallBackend(CeedDestroy(&ceed));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
   }
   CeedCallBackend(CeedFree(&impl->diag));
 
@@ -91,6 +92,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
     CeedCallHip(ceed, hipModuleUnload(impl->asmb->module));
     CeedCallHip(ceed, hipFree(impl->asmb->d_B_in));
     CeedCallHip(ceed, hipFree(impl->asmb->d_B_out));
+    CeedCallBackend(CeedDestroy(&ceed));
   }
   CeedCallBackend(CeedFree(&impl->asmb));
 
@@ -226,6 +228,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -233,7 +236,6 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Hip(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -244,7 +246,6 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -585,11 +586,7 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) {
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
-      continue;
-    }
-    {
+    if (!impl->skip_rstr_out[field]) {
       CeedElemRestriction elem_rstr;
 
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
@@ -601,6 +598,7 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Return work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -608,7 +606,6 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -619,7 +616,6 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -915,11 +911,7 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     }
 
     // Restrict
-    if (impl->skip_rstr_out[field]) {
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
-      continue;
-    }
-    {
+    if (!impl->skip_rstr_out[field]) {
       CeedElemRestriction elem_rstr;
 
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
@@ -931,6 +923,7 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
   // Restore work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1072,6 +1065,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   }
 
   // Restore output
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
   return CEED_ERROR_SUCCESS;
 }
@@ -1273,6 +1268,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
   CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice));
   CeedCallBackend(CeedFree(&eval_modes_in));
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
@@ -1358,6 +1354,7 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
                                     num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
                                     use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
   CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
   return CEED_ERROR_SUCCESS;
@@ -1446,6 +1443,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
   return CEED_ERROR_SUCCESS;
 }
@@ -1658,6 +1656,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
     CeedCallBackend(CeedFree(&identity));
   }
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
@@ -1766,6 +1765,7 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   return CEED_ERROR_SUCCESS;
@@ -2037,6 +2037,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   // Restore work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2059,6 +2060,7 @@ int CeedOperatorCreate_Hip(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2077,6 +2079,7 @@ int CeedOperatorCreateAtPoints_Hip(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
index 2311f8a332..127bd7467d 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
@@ -30,13 +30,13 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   CeedQFunctionField *input_fields, *output_fields;
   CeedQFunction_Hip  *data;
 
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
-  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
-  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
-
   // QFunction is built
+  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
   if (data->QFunction) return CEED_ERROR_SUCCESS;
 
+  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
+
   // QFunction kernel generation
   CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
 
@@ -116,6 +116,7 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   // Compile kernel
   CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 1, "BLOCK_SIZE", ceed_Hip->opt_block_size));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, kernel_name.c_str(), &data->QFunction));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c
index 18d531ac11..92835b897e 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunction.c
@@ -60,6 +60,7 @@ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce
 
   // Restore context
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -88,12 +89,12 @@ int CeedQFunctionCreate_Hip(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionSetData(qf, data));
   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
 
-  // Read QFunction name
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
index 52bf13370b..0d09a2087d 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
@@ -37,6 +37,7 @@ static inline int CeedQFunctionContextSyncH2D_Hip(const CeedQFunctionContext ctx
     impl->d_data = impl->d_data_owned;
   }
   CeedCallHip(ceed, hipMemcpy(impl->d_data, impl->h_data, ctx_size, hipMemcpyHostToDevice));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -63,6 +64,7 @@ static inline int CeedQFunctionContextSyncD2H_Hip(const CeedQFunctionContext ctx
     impl->h_data = impl->h_data_owned;
   }
   CeedCallHip(ceed, hipMemcpy(impl->h_data, impl->d_data, ctx_size, hipMemcpyDeviceToHost));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -204,6 +206,7 @@ static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx,
       impl->d_data          = data;
       break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -334,6 +337,7 @@ int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index ca1d19d7a6..88393e25d3 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -108,6 +108,7 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
 
     } break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -287,6 +288,7 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -406,6 +408,7 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) {
   CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned));
   CeedCallHip(ceed, hipFree((CeedInt *)impl->d_points_per_elem_owned));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -487,6 +490,7 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const C
   CeedCallBackend(CeedFree(&l_vec_indices));
   CeedCallBackend(CeedFree(&t_offsets));
   CeedCallBackend(CeedFree(&t_indices));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -504,6 +508,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
   CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
@@ -651,6 +656,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", CeedElemRestrictionGetAtPointsElementOffset_Hip));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 164eb822d6..0a3a3fe3d0 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -39,15 +39,13 @@ static inline int CeedVectorNeedSync_Hip(const CeedVector vec, CeedMemType mem_t
 // Sync host to device
 //------------------------------------------------------------------------------
 static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) {
-  Ceed            ceed;
   CeedSize        length;
   size_t          bytes;
   CeedVector_Hip *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
-  CeedCheck(impl->h_array, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device");
+  CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
 
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   bytes = length * sizeof(CeedScalar);
@@ -56,10 +54,10 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) {
   } else if (impl->d_array_owned) {
     impl->d_array = impl->d_array_owned;
   } else {
-    CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, bytes));
+    CeedCallHip(CeedVectorReturnCeed(vec), hipMalloc((void **)&impl->d_array_owned, bytes));
     impl->d_array = impl->d_array_owned;
   }
-  CeedCallHip(ceed, hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice));
+  CeedCallHip(CeedVectorReturnCeed(vec), hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -67,15 +65,13 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) {
 // Sync device to host
 //------------------------------------------------------------------------------
 static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) {
-  Ceed            ceed;
   CeedSize        length;
   size_t          bytes;
   CeedVector_Hip *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
-  CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
+  CeedCheck(impl->d_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid device data to sync to host");
 
   if (impl->h_array_borrowed) {
     impl->h_array = impl->h_array_borrowed;
@@ -91,7 +87,7 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) {
 
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   bytes = length * sizeof(CeedScalar);
-  CeedCallHip(ceed, hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost));
+  CeedCallHip(CeedVectorReturnCeed(vec), hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -202,6 +198,7 @@ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode
 
   CeedCallBackend(CeedSetDeviceCeedScalarArray_Hip(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned,
                                                    (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -574,6 +571,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
     }
   }
   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -786,6 +784,7 @@ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index cdcc28ce07..6d8c858632 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -248,6 +248,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -276,7 +277,6 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   CeedScalar           *d_v;
   CeedBasis_Hip_shared *data;
 
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
@@ -287,6 +287,8 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
   // Check padded to uniform number of points per elem
   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
   {
@@ -507,6 +509,7 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c
index f69c78d4ee..01d173e42e 100644
--- a/backends/hip-shared/ceed-hip-shared.c
+++ b/backends/hip-shared/ceed-hip-shared.c
@@ -33,6 +33,7 @@ static int CeedInit_Hip_shared(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 6f5fa0a2ca..e1f39139d6 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -249,6 +249,7 @@ static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num
     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -371,6 +372,7 @@ static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, Ce
       CeedCallBackend(CeedFree(&basis_kernel_source));
       for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
       CeedCallBackend(CeedFree(&file_paths));
+      CeedCallBackend(CeedDestroy(&ceed_delegate));
     }
   }
 
@@ -457,6 +459,7 @@ static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, Ce
     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -490,6 +493,7 @@ static int CeedBasisDestroy_Magma(CeedBasis basis) {
   CeedCallBackend(magma_free(impl->d_grad_1d));
   if (impl->d_q_weight_1d) CeedCallBackend(magma_free(impl->d_q_weight_1d));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -517,6 +521,7 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
   CeedCallBackend(magma_free(impl->d_curl));
   if (impl->d_q_weight) CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -626,6 +631,8 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_delegate));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -680,6 +687,7 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
     CeedCallBackend(CeedFree(&weight_kernel_path));
     CeedCallBackend(CeedFree(&basis_kernel_source));
+    CeedCallBackend(CeedDestroy(&ceed_delegate));
   }
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
@@ -688,6 +696,7 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -742,6 +751,7 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no
     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
     CeedCallBackend(CeedFree(&weight_kernel_path));
     CeedCallBackend(CeedFree(&basis_kernel_source));
+    CeedCallBackend(CeedDestroy(&ceed_delegate));
   }
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
@@ -750,6 +760,7 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -804,6 +815,7 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n
     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
     CeedCallBackend(CeedFree(&weight_kernel_path));
     CeedCallBackend(CeedFree(&basis_kernel_source));
+    CeedCallBackend(CeedDestroy(&ceed_delegate));
   }
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
@@ -812,6 +824,7 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/magma/ceed-magma-det.c b/backends/magma/ceed-magma-det.c
index 9b7125ccda..5ecbe462e1 100644
--- a/backends/magma/ceed-magma-det.c
+++ b/backends/magma/ceed-magma-det.c
@@ -35,6 +35,7 @@ static int CeedInit_Magma_Det(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedInit("/gpu/cuda/magma", &ceed_ref));
 #endif
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c
index 06254365b9..f325947920 100644
--- a/backends/magma/ceed-magma.c
+++ b/backends/magma/ceed-magma.c
@@ -36,6 +36,7 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref));
 #endif
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Magma));
diff --git a/backends/memcheck/ceed-memcheck-blocked.c b/backends/memcheck/ceed-memcheck-blocked.c
index 4d9f557af5..5bc0a5dc10 100644
--- a/backends/memcheck/ceed-memcheck-blocked.c
+++ b/backends/memcheck/ceed-memcheck-blocked.c
@@ -22,6 +22,7 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/blocked", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Memcheck));
diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c
index 1bb8158584..b73e3c5e90 100644
--- a/backends/memcheck/ceed-memcheck-qfunction.c
+++ b/backends/memcheck/ceed-memcheck-qfunction.c
@@ -17,7 +17,6 @@
 // QFunction Apply
 //------------------------------------------------------------------------------
 static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) {
-  Ceed                    ceed;
   void                   *ctx_data = NULL;
   int                     input_block_ids[CEED_FIELD_MAX], output_block_ids[CEED_FIELD_MAX];
   CeedInt                 num_in, num_out;
@@ -25,7 +24,6 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
   CeedQFunctionField     *output_fields;
   CeedQFunction_Memcheck *impl;
 
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
   CeedCallBackend(CeedQFunctionGetData(qf, &impl));
   CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data));
   CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f));
@@ -82,7 +80,7 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
       CeedCallBackend(CeedQFunctionFieldGetSize(output_fields[i], &field_size));
       CeedCallBackend(CeedQFunctionFieldGetName(output_fields[i], &field_name));
       for (CeedSize j = 0; j < field_size * (CeedSize)Q; j++) {
-        CeedCheck(!isnan(impl->outputs[i][j]), ceed, CEED_ERROR_BACKEND,
+        CeedCheck(!isnan(impl->outputs[i][j]), CeedQFunctionReturnCeed(qf), CEED_ERROR_BACKEND,
                   "QFunction output %" CeedInt_FMT " '%s' entry %" CeedSize_FMT " is NaN after restoring write-only access: %s:%s ", i, field_name, j,
                   kernel_path, kernel_name);
       }
@@ -121,6 +119,7 @@ int CeedQFunctionCreate_Memcheck(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionSetData(qf, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/memcheck/ceed-memcheck-qfunctioncontext.c b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
index 57afe981af..6149a5a3ac 100644
--- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c
+++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
@@ -206,18 +206,16 @@ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) {
 // QFunctionContext Restore Data Read-Only
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx) {
-  Ceed                           ceed;
   size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
   // Verify no changes made during read-only access
   bool is_changed = memcmp(impl->data_allocated, impl->data_read_only_copy, ctx_size);
 
-  CeedCheck(!is_changed, ceed, CEED_ERROR_BACKEND, "Context data changed while accessed in read-only mode");
+  CeedCheck(!is_changed, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Context data changed while accessed in read-only mode");
 
   // Invalidate read-only buffer
   memset(impl->data_read_only_copy, -42, ctx_size);
@@ -230,16 +228,15 @@ static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx
 // QFunctionContext destroy user data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) {
-  Ceed                                ceed;
   CeedMemType                         data_destroy_mem_type;
   CeedQFunctionContextDataDestroyUser data_destroy_function;
   CeedQFunctionContext_Memcheck      *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
   CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function));
-  CeedCheck(data_destroy_mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only destroy HOST memory for this backend");
+  CeedCheck(data_destroy_mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND,
+            "Can only destroy HOST memory for this backend");
 
   // Run user destroy routine
   if (data_destroy_function) {
@@ -305,6 +302,7 @@ int CeedQFunctionContextCreate_Memcheck(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreDataRead_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "DataDestroy", CeedQFunctionContextDataDestroy_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index 35d3016726..57faf28116 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -768,6 +768,7 @@ int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_m
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/memcheck/ceed-memcheck-serial.c b/backends/memcheck/ceed-memcheck-serial.c
index f23a8013e6..433380d6d4 100644
--- a/backends/memcheck/ceed-memcheck-serial.c
+++ b/backends/memcheck/ceed-memcheck-serial.c
@@ -23,6 +23,7 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Memcheck));
diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index 325fd52c34..52716d5c70 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -248,11 +248,9 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type
 // Vector Restore Array
 //------------------------------------------------------------------------------
 static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
-  Ceed                 ceed;
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
@@ -260,7 +258,8 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
   if (impl->is_write_only_access) {
     for (CeedSize i = 0; i < length; i++) {
       if (isnan(impl->array_writable_copy[i])) {
-        CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i);
+        CeedDebug256(CeedVectorReturnCeed(vec), CEED_DEBUG_COLOR_WARNING,
+                     "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i);
       }
     }
     impl->is_write_only_access = false;
@@ -281,18 +280,16 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
 // Vector Restore Array Read-Only
 //------------------------------------------------------------------------------
 static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) {
-  Ceed                 ceed;
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
   // Verify no changes made during read-only access
   bool is_changed = memcmp(impl->array_allocated, impl->array_read_only_copy, length * sizeof(CeedScalar));
 
-  CeedCheck(!is_changed, ceed, CEED_ERROR_BACKEND, "Array data changed while accessed in read-only mode");
+  CeedCheck(!is_changed, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Array data changed while accessed in read-only mode");
 
   // Invalidate read-only buffer
   for (CeedSize i = 0; i < length; i++) impl->array_read_only_copy[i] = NAN;
@@ -409,9 +406,6 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) {
   Ceed                 ceed;
   CeedVector_Memcheck *impl;
 
-  CeedCallBackend(CeedCalloc(1, &impl));
-  CeedCallBackend(CeedVectorSetData(vec, impl));
-
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Memcheck));
@@ -431,6 +425,9 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/opt/ceed-opt-blocked.c b/backends/opt/ceed-opt-blocked.c
index 6b0125f2fa..fcb8140ef1 100644
--- a/backends/opt/ceed-opt-blocked.c
+++ b/backends/opt/ceed-opt-blocked.c
@@ -37,6 +37,7 @@ static int CeedInit_Opt_Blocked(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt));
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 8057741208..69f4560a06 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -30,7 +30,8 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -105,6 +106,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
           // Empty case - won't occur
           break;
       }
+      CeedCallBackend(CeedDestroy(&ceed_rstr));
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e]));
     }
@@ -193,6 +195,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -214,6 +217,7 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -447,6 +451,7 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -712,6 +717,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -790,6 +796,7 @@ int CeedOperatorCreate_Opt(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Opt));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/opt/ceed-opt-serial.c b/backends/opt/ceed-opt-serial.c
index ac506a4ec6..66fc1a9cfb 100644
--- a/backends/opt/ceed-opt-serial.c
+++ b/backends/opt/ceed-opt-serial.c
@@ -37,6 +37,7 @@ static int CeedInit_Opt_Serial(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt));
diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index 121669012a..550f631159 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -18,7 +18,6 @@
 //------------------------------------------------------------------------------
 static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U,
                                   CeedVector V) {
-  Ceed               ceed;
   bool               is_tensor_basis, add = apply_add || (t_mode == CEED_TRANSPOSE);
   CeedInt            dim, num_comp, q_comp, num_nodes, num_qpts;
   const CeedScalar  *u;
@@ -26,7 +25,6 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
   CeedTensorContract contract;
   CeedBasis_Ref     *impl;
 
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &impl));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
@@ -35,7 +33,7 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
   CeedCallBackend(CeedBasisGetTensorContract(basis, &contract));
   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u));
-  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
   // Clear v if operating in transpose
   if (apply_add) CeedCallBackend(CeedVectorGetArray(V, CEED_MEM_HOST, &v));
   else CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v));
@@ -172,7 +170,7 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
         CeedInt           Q = Q_1d;
         const CeedScalar *q_weight_1d;
 
-        CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+        CeedCheck(t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
         CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight_1d));
         for (CeedInt d = 0; d < dim; d++) {
           CeedInt pre = CeedIntPow(Q, dim - d - 1), post = CeedIntPow(Q, d);
@@ -191,9 +189,9 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
       // LCOV_EXCL_START
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
-        return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
       case CEED_EVAL_NONE:
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
+        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
         // LCOV_EXCL_STOP
     }
   } else {
@@ -233,7 +231,7 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
       case CEED_EVAL_WEIGHT: {
         const CeedScalar *q_weight;
 
-        CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+        CeedCheck(t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
         CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight));
         for (CeedInt i = 0; i < num_qpts; i++) {
           for (CeedInt e = 0; e < num_elem; e++) v[i * num_elem + e] = q_weight[i];
@@ -241,7 +239,7 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
       } break;
       // LCOV_EXCL_START
       case CEED_EVAL_NONE:
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
+        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
         // LCOV_EXCL_STOP
     }
   }
@@ -312,6 +310,8 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -331,6 +331,8 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -350,6 +352,8 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -369,6 +373,8 @@ int CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 4c62608d49..fa461f899a 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -30,7 +30,8 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -134,6 +135,7 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -472,7 +474,7 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
 //------------------------------------------------------------------------------
 static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
                                                               CeedRequest *request) {
-  Ceed                ceed, ceed_parent;
+  Ceed                ceed_parent;
   CeedInt             qf_size_in, qf_size_out, Q, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -480,7 +482,6 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Ref   *impl;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   qf_size_in  = impl->qf_size_in;
@@ -495,7 +496,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetup_Ref(op));
 
   // Check for restriction only operator
-  CeedCheck(!impl->is_identity_rstr_op, ceed, CEED_ERROR_BACKEND, "Assembling restriction only operators is not supported");
+  CeedCheck(!impl->is_identity_rstr_op, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Assembling restriction only operators is not supported");
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
@@ -516,7 +517,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
       }
       CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    CeedCheck(qf_size_in > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_in = qf_size_in;
   }
 
@@ -535,7 +536,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
       }
       CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    CeedCheck(qf_size_out > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
     impl->qf_size_out = qf_size_out;
   }
 
@@ -644,6 +645,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -678,7 +680,8 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -815,6 +818,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1282,6 +1286,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&point_coords));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   return CEED_ERROR_SUCCESS;
@@ -1331,7 +1336,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
 
   // Point coordinates
@@ -1530,6 +1536,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
   CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&in_vec));
   CeedCallBackend(CeedVectorDestroy(&out_vec));
   CeedCallBackend(CeedVectorDestroy(&point_coords));
@@ -1587,6 +1594,7 @@ int CeedOperatorCreate_Ref(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1606,6 +1614,7 @@ int CeedOperatorCreateAtPoints_Ref(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-qfunction.c b/backends/ref/ceed-ref-qfunction.c
index d2bbd07ad1..efddda2dcc 100644
--- a/backends/ref/ceed-ref-qfunction.c
+++ b/backends/ref/ceed-ref-qfunction.c
@@ -71,6 +71,7 @@ int CeedQFunctionCreate_Ref(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionSetData(qf, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-qfunctioncontext.c b/backends/ref/ceed-ref-qfunctioncontext.c
index 9fd2d013db..0d3c8bba36 100644
--- a/backends/ref/ceed-ref-qfunctioncontext.c
+++ b/backends/ref/ceed-ref-qfunctioncontext.c
@@ -131,6 +131,7 @@ int CeedQFunctionContextCreate_Ref(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", CeedQFunctionContextRestoreData_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreData_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 068b9906f8..65b6cf080f 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -774,14 +774,11 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode,
 
     // Check indices for ref or memcheck backends
     {
-      Ceed current = ceed, parent = NULL;
+      Ceed current = ceed, ceed_parent = NULL;
 
-      CeedCallBackend(CeedGetParent(current, &parent));
-      while (current != parent) {
-        current = parent;
-        CeedCallBackend(CeedGetParent(current, &parent));
-      }
-      CeedCallBackend(CeedGetResource(parent, &resource));
+      CeedCallBackend(CeedGetParent(current, &ceed_parent));
+      CeedCallBackend(CeedGetResource(ceed_parent, &resource));
+      CeedCallBackend(CeedDestroy(&ceed_parent));
     }
     if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked")) {
       CeedSize l_size;
@@ -871,6 +868,7 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-tensor.c b/backends/ref/ceed-ref-tensor.c
index a2064cfce6..17499172a6 100644
--- a/backends/ref/ceed-ref-tensor.c
+++ b/backends/ref/ceed-ref-tensor.c
@@ -51,6 +51,7 @@ int CeedTensorContractCreate_Ref(CeedTensorContract contract) {
   CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
   CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Destroy", CeedTensorContractDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-vector.c b/backends/ref/ceed-ref-vector.c
index f907d232c8..2af3a8770c 100644
--- a/backends/ref/ceed-ref-vector.c
+++ b/backends/ref/ceed-ref-vector.c
@@ -149,6 +149,7 @@ int CeedVectorCreate_Ref(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index ee7aab812c..f1fb58e42f 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -780,8 +780,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
 
   // Load kernel function
   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
-
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index 0736446f4c..eb74176bbf 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -58,6 +58,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
       CeedOperator op_fallback;
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to sycl/ref CeedOperator due to non-tensor bases");
+      CeedCallBackend(CeedDestroy(&ceed));
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
@@ -198,6 +199,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
 
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_impl->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -223,6 +225,7 @@ int CeedOperatorCreate_Sycl_gen(CeedOperator op) {
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
index 05774e6237..e810bfbf7a 100644
--- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
@@ -38,6 +38,7 @@ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedFree(&impl->qfunction_source));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -61,6 +62,7 @@ int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index 2cee0469a7..2a1ed4ad49 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -35,12 +35,14 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
   CeedCallBackend(CeedSetStream_Sycl(ceed_shared, &(data->sycl_queue)));
+  CeedCallBackend(CeedDestroy(&ceed_shared));
 
   CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
 
   Ceed ceed_fallback = NULL;
   CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
   CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, &(data->sycl_queue)));
+  CeedCallBackend(CeedDestroy(&ceed_fallback));
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen));
diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
index e5495fe83f..1330d61a6a 100644
--- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
@@ -328,6 +328,7 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -504,7 +505,7 @@ static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem,
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
-
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -527,6 +528,7 @@ static int CeedBasisDestroy_Sycl(CeedBasis basis) {
   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -549,6 +551,7 @@ static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
   CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -557,11 +560,12 @@ static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
 //------------------------------------------------------------------------------
 int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  Ceed            ceed;
   CeedBasis_Sycl *impl;
+  Ceed_Sycl      *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
-  Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
   CeedInt num_comp;
@@ -617,6 +621,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -625,11 +630,12 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 //------------------------------------------------------------------------------
 int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                            const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  Ceed                     ceed;
   CeedBasisNonTensor_Sycl *impl;
+  Ceed_Sycl               *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
-  Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
   CeedInt num_comp;
@@ -668,6 +674,7 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index e43981c217..e843015df1 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -107,6 +107,7 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->asmb));
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -206,6 +207,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
         break;  // TODO: Not implemented
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -214,7 +216,6 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
 // passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Sycl(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -225,7 +226,6 @@ static int CeedOperatorSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -432,22 +432,14 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
         break;
       }
       // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        Ceed ceed;
-
-        CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+      case CEED_EVAL_WEIGHT:
+        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         break;  // Should not occur
-      }
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        Ceed ceed;
-
-        CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-        return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      case CEED_EVAL_CURL:
+        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
         break;  // Should not occur
-      }
-        // LCOV_EXCL_STOP
+                // LCOV_EXCL_STOP
     }
   }
 
@@ -483,7 +475,7 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
 //------------------------------------------------------------------------------
 static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled,
                                                                CeedElemRestriction *elem_rstr, CeedRequest *request) {
-  Ceed                ceed, ceed_parent;
+  Ceed                ceed_parent;
   CeedSize            q_size;
   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
   CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
@@ -493,7 +485,6 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Sycl  *impl;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
@@ -525,7 +516,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
         CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
         for (CeedInt field = 0; field < size; field++) {
           q_size = (CeedSize)Q * num_elem;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
+          CeedCallBackend(CeedVectorCreate(ceed_parent, q_size, &active_in[num_active_in + field]));
           CeedCallBackend(
               CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
         }
@@ -555,7 +546,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
   }
 
   // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+  CeedCheck(num_active_in > 0 && num_active_out > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND,
+            "Cannot assemble QFunction without active inputs and outputs");
 
   // Build objects if needed
   if (build_objects) {
@@ -614,6 +606,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -820,6 +813,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
 
@@ -921,8 +915,9 @@ static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVec
   CeedOperator_Sycl *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Assemble QFunction
   {
@@ -1175,6 +1170,7 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
       mat_start += dim * elem_size * num_qpts;
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
@@ -1346,8 +1342,9 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed
   CeedOperator_Sycl  *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Setup
   if (!impl->asmb) {
@@ -1398,6 +1395,7 @@ int CeedOperatorCreate_Sycl(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
index 1d3cf330ad..606a1f45ad 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
@@ -35,8 +35,8 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   CeedQFunctionField *input_fields, *output_fields;
   CeedQFunction_Sycl *impl;
 
-  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&impl));
   // QFunction is built
+  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&impl));
   if (impl->QFunction) return CEED_ERROR_SUCCESS;
 
   CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
@@ -175,6 +175,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   CeedCallBackend(CeedFree(&qfunction_source));
   CeedCallBackend(CeedFree(&read_write_kernel_path));
   CeedCallBackend(CeedFree(&read_write_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
index 4de8fcf379..6a2c7f060b 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
@@ -37,6 +37,7 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
 
   CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
 
   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
 
@@ -118,6 +119,7 @@ static int CeedQFunctionDestroy_Sycl(CeedQFunction qf) {
   delete impl->QFunction;
   delete impl->sycl_module;
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -134,6 +136,7 @@ int CeedQFunctionCreate_Sycl(CeedQFunction qf) {
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
index 1c942a645b..7130a0dead 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
@@ -42,6 +42,7 @@ static inline int CeedQFunctionContextSyncH2D_Sycl(const CeedQFunctionContext ct
   if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
   sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, impl->h_data, ctx_size, e);
   CeedCallSycl(ceed, copy_event.wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -75,6 +76,7 @@ static inline int CeedQFunctionContextSyncD2H_Sycl(const CeedQFunctionContext ct
   if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
   sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->h_data, impl->d_data, ctx_size, e);
   CeedCallSycl(ceed, copy_event.wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -229,6 +231,7 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx
       impl->d_data          = data;
     } break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -237,9 +240,6 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx
 //   freeing any previously allocated data if applicable
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) {
-  Ceed ceed;
-
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextSetAllInvalid_Sycl(ctx));
   switch (mem_type) {
     case CEED_MEM_HOST:
@@ -260,8 +260,9 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con
   CeedQFunctionContext_Sycl *impl;
 
   CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCallBackend(CeedGetData(ceed, &ceedSycl));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
   // Order queue if needed
   if (!ceedSycl->sycl_queue.is_in_order()) ceedSycl->sycl_queue.ext_oneapi_submit_barrier();
@@ -291,11 +292,9 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con
 //   If a different memory type is most up to date, this will perform a copy
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextGetDataCore_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) {
-  Ceed                       ceed;
   bool                       need_sync = false;
   CeedQFunctionContext_Sycl *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
   // Sync data to requested mem_type
@@ -325,11 +324,9 @@ static int CeedQFunctionContextGetDataRead_Sycl(const CeedQFunctionContext ctx,
 // Get read/write access to the data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextGetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) {
-  Ceed                       ceed;
   CeedQFunctionContext_Sycl *impl;
 
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetDataCore_Sycl(ctx, mem_type, data));
 
   // Mark only pointer for requested memory as valid
@@ -360,6 +357,7 @@ static int CeedQFunctionContextDestroy_Sycl(const CeedQFunctionContext ctx) {
   // Wait for all work to finish before freeing memory
   CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw());
   CeedCallSycl(ceed, sycl::free(impl->d_data_owned, sycl_data->sycl_context));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedFree(&impl->h_data_owned));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
@@ -380,6 +378,7 @@ int CeedQFunctionContextCreate_Sycl(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
index d85d036587..07a451213b 100644
--- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
@@ -195,6 +195,7 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTranspose
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -202,10 +203,8 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTranspose
 // Get offsets
 //------------------------------------------------------------------------------
 static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction rstr, CeedMemType m_type, const CeedInt **offsets) {
-  Ceed                      ceed;
   CeedElemRestriction_Sycl *impl;
 
-  CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
 
   switch (m_type) {
@@ -240,6 +239,7 @@ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction rstr) {
   CeedCallSycl(ceed, sycl::free(impl->d_t_indices, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_l_vec_indices, data->sycl_context));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -328,6 +328,7 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction rstr, const
   CeedCallBackend(CeedFree(&l_vec_indices));
   CeedCallBackend(CeedFree(&t_offsets));
   CeedCallBackend(CeedFree(&t_indices));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -472,5 +473,6 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApply_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
index 427f51f727..32dda419f0 100644
--- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
@@ -44,8 +44,9 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+
   CeedCheck(impl->h_array, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device");
 
   CeedCallBackend(CeedVectorGetLength(vec, &length));
@@ -63,6 +64,7 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) {
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
   CeedCallSycl(ceed, data->sycl_queue.copy<CeedScalar>(impl->h_array, impl->d_array, length, e).wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -76,8 +78,8 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
 
   CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
 
@@ -96,6 +98,7 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) {
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
   CeedCallSycl(ceed, data->sycl_queue.copy<CeedScalar>(impl->d_array, impl->h_array, length, e).wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -244,6 +247,7 @@ static int CeedVectorSetArrayDevice_Sycl(const CeedVector vec, const CeedCopyMod
       impl->d_array          = impl->d_array_borrowed;
       break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -295,9 +299,10 @@ static int CeedVectorSetValue_Sycl(CeedVector vec, CeedScalar val) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (!impl->d_array && !impl->h_array) {
@@ -333,8 +338,10 @@ static int CeedVectorTakeArray_Sycl(CeedVector vec, CeedMemType mem_type, CeedSc
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
 
   // Order queue if needed
   if (!data->sycl_queue.is_in_order()) data->sycl_queue.ext_oneapi_submit_barrier();
@@ -447,9 +454,10 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no
   CeedVector_Sycl  *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Compute norm
   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
@@ -515,9 +523,10 @@ static int CeedVectorReciprocal_Sycl(CeedVector vec) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Sycl(data->sycl_queue, impl->d_array, length));
@@ -554,9 +563,10 @@ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) {
   CeedVector_Sycl *x_impl;
 
   CeedCallBackend(CeedVectorGetCeed(x, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetLength(x, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Sycl(data->sycl_queue, x_impl->d_array, alpha, length));
@@ -593,10 +603,11 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) {
   CeedVector_Sycl *y_impl, *x_impl;
 
   CeedCallBackend(CeedVectorGetCeed(y, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(y, &y_impl));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetLength(y, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (y_impl->d_array) {
@@ -639,11 +650,12 @@ static int CeedVectorPointwiseMult_Sycl(CeedVector w, CeedVector x, CeedVector y
   CeedVector_Sycl *w_impl, *x_impl, *y_impl;
 
   CeedCallBackend(CeedVectorGetCeed(w, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(w, &w_impl));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetData(y, &y_impl));
   CeedCallBackend(CeedVectorGetLength(w, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (!w_impl->d_array && !w_impl->h_array) {
@@ -681,6 +693,7 @@ static int CeedVectorDestroy_Sycl(const CeedVector vec) {
 
   CeedCallBackend(CeedFree(&impl->h_array_owned));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -711,6 +724,7 @@ int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Scale", CeedVectorScale_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
index d549f6cd4f..7d8302599f 100644
--- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
@@ -128,7 +128,7 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
-
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -157,6 +157,7 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) {
   delete impl->sycl_module;
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -277,6 +278,7 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Sycl_shared));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp
index e51405d7fa..176b39cd84 100644
--- a/backends/sycl/ceed-sycl-common.sycl.cpp
+++ b/backends/sycl/ceed-sycl-common.sycl.cpp
@@ -107,12 +107,14 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) {
   if (ceed_delegate) {
     CeedCallBackend(CeedSetStream_Sycl(ceed_delegate, handle));
   }
+  CeedCallBackend(CeedDestroy(&ceed_delegate));
 
   // Set queue and context for Ceed Fallback object
-  CeedGetOperatorFallbackCeed(ceed, &ceed_fallback);
+  CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
   if (ceed_fallback) {
     CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, handle));
   }
+  CeedCallBackend(CeedDestroy(&ceed_fallback));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/xsmm/ceed-xsmm-blocked.c b/backends/xsmm/ceed-xsmm-blocked.c
index 90dc19e741..1bd5d724f1 100644
--- a/backends/xsmm/ceed-xsmm-blocked.c
+++ b/backends/xsmm/ceed-xsmm-blocked.c
@@ -25,6 +25,7 @@ static int CeedInit_Xsmm_Blocked(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/xsmm/ceed-xsmm-serial.c b/backends/xsmm/ceed-xsmm-serial.c
index 68e51a63e3..69d51b769f 100644
--- a/backends/xsmm/ceed-xsmm-serial.c
+++ b/backends/xsmm/ceed-xsmm-serial.c
@@ -25,6 +25,7 @@ static int CeedInit_Xsmm_Serial(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c
index 1dc69b30a8..899726ef09 100644
--- a/backends/xsmm/ceed-xsmm-tensor.c
+++ b/backends/xsmm/ceed-xsmm-tensor.c
@@ -16,10 +16,6 @@
 //------------------------------------------------------------------------------
 static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t,
                                         CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) {
-  Ceed ceed;
-
-  CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
-
   if (C == 1) {
     // Build or query the required kernel
     const int                  flags_t    = LIBXSMM_GEMM_FLAGS(!t_mode ? 'T' : 'N', 'N');
@@ -33,7 +29,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A,
     const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
     libxsmm_gemm_param         gemm_param;
 
-    CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
+    CeedCheck(kernel, CeedTensorContractReturnCeed(contract), CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
 
     // Run kernel
     gemm_param.a.primary = (CeedScalar *)&t[0];
@@ -53,7 +49,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A,
     const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
     libxsmm_gemm_param         gemm_param;
 
-    CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
+    CeedCheck(kernel, CeedTensorContractReturnCeed(contract), CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
 
     // Run kernel
     gemm_param.b.primary = (CeedScalar *)&t[0];
diff --git a/examples/fluids/include/petsc-ceed-utils.h b/examples/fluids/include/petsc-ceed-utils.h
index 936f278ee3..d085ad670f 100644
--- a/examples/fluids/include/petsc-ceed-utils.h
+++ b/examples/fluids/include/petsc-ceed-utils.h
@@ -220,6 +220,7 @@ static inline PetscErrorCode VecCopyPetscToCeed(Vec X_petsc, CeedVector x_ceed)
   PetscCall(VecGetArrayReadAndMemType(X_petsc, (const PetscScalar **)&x, &mem_type));
   PetscCallCeed(ceed, CeedVectorSetArray(x_ceed, MemTypePetscToCeed(mem_type), CEED_COPY_VALUES, x));
   PetscCall(VecRestoreArrayReadAndMemType(X_petsc, (const PetscScalar **)&x));
+  PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, PetscObjectComm((PetscObject)X_petsc), PETSC_ERR_LIB, "Destroying Ceed object failed");
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index cd164598ed..358160f678 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -1229,7 +1229,6 @@ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_trans
   // libCEED objects
   PetscCheck(CeedOperatorGetCeed(op_mult, &(*ctx)->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB,
              "retrieving Ceed context object failed");
-  PetscCallCeed((*ctx)->ceed, CeedReference((*ctx)->ceed));
   PetscCallCeed((*ctx)->ceed, CeedOperatorGetActiveVectorLengths(op_mult, &x_loc_len, &y_loc_len));
   PetscCallCeed((*ctx)->ceed, CeedOperatorReferenceCopy(op_mult, &(*ctx)->op_mult));
   if (op_mult_transpose) PetscCallCeed((*ctx)->ceed, CeedOperatorReferenceCopy(op_mult_transpose, &(*ctx)->op_mult_transpose));
diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index f40e156af2..b9b51209a3 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -197,6 +197,7 @@ PetscErrorCode CeedOperatorCreateLocalVecs(CeedOperator op, VecType vec_type, MP
     PetscCall(VecSetType(*output, vec_type));
     PetscCall(VecSetSizes(*output, output_size, output_size));
   }
+  PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, comm, PETSC_ERR_LIB, "Destroying Ceed object failed");
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 08f9ef36b9..db0532fdce 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -238,6 +238,8 @@ PetscErrorCode SpanStatsSetupDataDestroy(SpanStatsSetupData data) {
 
   PetscCallCeed(ceed, CeedVectorDestroy(&data->x_coord));
 
+  PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_WORLD, PETSC_ERR_LIB, "Destroying Ceed object failed");
+
   PetscCall(PetscFree(data));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index c6869f2f3b..902207f75e 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -194,16 +194,13 @@ static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedI
   @ref Developer
 **/
 static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis basis_to, CeedScalar **interp_project, CeedScalar **grad_project) {
-  Ceed    ceed;
   bool    are_both_tensor;
   CeedInt Q, Q_to, Q_from, P_to, P_from;
 
-  CeedCall(CeedBasisGetCeed(basis_to, &ceed));
-
   // Check for compatible quadrature spaces
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_to, &Q_to));
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_from, &Q_from));
-  CeedCheck(Q_to == Q_from, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(Q_to == Q_from, CeedBasisReturnCeed(basis_to), CEED_ERROR_DIMENSION,
             "Bases must have compatible quadrature spaces."
             " 'basis_from' has %" CeedInt_FMT " points and 'basis_to' has %" CeedInt_FMT,
             Q_from, Q_to);
@@ -231,7 +228,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
 
   CeedCall(CeedBasisGetFESpace(basis_to, &fe_space_to));
   CeedCall(CeedBasisGetFESpace(basis_from, &fe_space_from));
-  CeedCheck(fe_space_to == fe_space_from, ceed, CEED_ERROR_MINOR,
+  CeedCheck(fe_space_to == fe_space_from, CeedBasisReturnCeed(basis_to), CEED_ERROR_MINOR,
             "Bases must both be the same FE space type."
             " 'basis_from' is a %s and 'basis_to' is a %s",
             CeedFESpaces[fe_space_from], CeedFESpaces[fe_space_to]);
@@ -267,7 +264,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
 
   // Compute interp_to^+, pseudoinverse of interp_to
   CeedCall(CeedCalloc(Q * q_comp * P_to, &interp_to_inv));
-  CeedCall(CeedMatrixPseudoinverse(ceed, interp_to_source, Q * q_comp, P_to, interp_to_inv));
+  CeedCall(CeedMatrixPseudoinverse(CeedBasisReturnCeed(basis_to), interp_to_source, Q * q_comp, P_to, interp_to_inv));
   // Build matrices
   CeedInt     num_matrices = 1 + (fe_space_to == CEED_FE_SPACE_H1) * (are_both_tensor ? 1 : dim);
   CeedScalar *input_from[num_matrices], *output_project[num_matrices];
@@ -281,7 +278,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   for (CeedInt m = 0; m < num_matrices; m++) {
     // output_project = interp_to^+ * interp_from
     memcpy(interp_from, input_from[m], Q * P_from * q_comp * sizeof(input_from[m][0]));
-    CeedCall(CeedMatrixMatrixMultiply(ceed, interp_to_inv, input_from[m], output_project[m], P_to, P_from, Q * q_comp));
+    CeedCall(CeedMatrixMatrixMultiply(CeedBasisReturnCeed(basis_to), interp_to_inv, input_from[m], output_project[m], P_to, P_from, Q * q_comp));
     // Round zero to machine precision
     for (CeedInt i = 0; i < P_to * P_from; i++) {
       if (fabs(output_project[m][i]) < 10 * CEED_EPSILON) output_project[m][i] = 0.0;
@@ -318,9 +315,7 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
                                            CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0;
   CeedSize x_length = 0, u_length = 0, v_length;
-  Ceed     ceed;
 
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
   CeedCall(CeedBasisGetDimension(basis, &dim));
   CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
   CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
@@ -333,13 +328,14 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
 
   // Check compatibility coordinates vector
   for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
-  CeedCheck((x_length >= (CeedSize)total_num_points * (CeedSize)dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
+  CeedCheck((x_length >= (CeedSize)total_num_points * (CeedSize)dim) || (eval_mode == CEED_EVAL_WEIGHT), CeedBasisReturnCeed(basis),
+            CEED_ERROR_DIMENSION,
             "Length of reference coordinate vector incompatible with basis dimension and number of points."
             " Found reference coordinate vector of length %" CeedSize_FMT ", not of length %" CeedSize_FMT ".",
             x_length, (CeedSize)total_num_points * (CeedSize)dim);
 
   // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
-  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
             "CEED_EVAL_WEIGHT only supported with CEED_NOTRANSPOSE");
 
   // Check vector lengths to prevent out of bounds issues
@@ -364,10 +360,11 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
     case CEED_EVAL_NONE:
     case CEED_EVAL_DIV:
     case CEED_EVAL_CURL:
-      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s", CeedEvalModes[eval_mode]);
+      return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s",
+                       CeedEvalModes[eval_mode]);
       // LCOV_EXCL_STOP
   }
-  CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
+  CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
   return CEED_ERROR_SUCCESS;
 }
 
@@ -395,12 +392,10 @@ static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, co
 static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
                                        CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0];
-  Ceed    ceed;
 
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
   CeedCall(CeedBasisGetDimension(basis, &dim));
   // Inserting check because clang-tidy doesn't understand this cannot occur
-  CeedCheck(dim > 0, ceed, CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required");
+  CeedCheck(dim > 0, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required");
   CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
   CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
@@ -410,9 +405,11 @@ static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt
     bool is_tensor_basis;
 
     CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
-    CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases");
+    CeedCheck(is_tensor_basis, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
+              "Evaluation at arbitrary points only supported for tensor product bases");
   }
-  CeedCheck(num_elem == 1, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary  points only supported for a single element at a time");
+  CeedCheck(num_elem == 1, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
+            "Evaluation at arbitrary  points only supported for a single element at a time");
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCall(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
@@ -421,6 +418,7 @@ static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt
     // Build basis mapping from nodes to Chebyshev coefficients
     CeedScalar       *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d;
     const CeedScalar *q_ref_1d;
+    Ceed              ceed;
 
     CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
     CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d));
@@ -428,6 +426,7 @@ static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt
     CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
     CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
 
+    CeedCall(CeedBasisGetCeed(basis, &ceed));
     CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev));
     CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d,
                                      &basis->basis_chebyshev));
@@ -436,6 +435,7 @@ static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt
     CeedCall(CeedFree(&chebyshev_interp_1d));
     CeedCall(CeedFree(&chebyshev_grad_1d));
     CeedCall(CeedFree(&chebyshev_q_weight_1d));
+    CeedCall(CeedDestroy(&ceed));
   }
 
   // Create TensorContract object if needed, such as a basis from the GPU backends
@@ -447,7 +447,8 @@ static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt
     // Only need matching tensor contraction dimensions, any type of basis will work
     CeedCall(CeedBasisCreateTensorH1Lagrange(ceed_ref, dim, num_comp, P_1d, Q_1d, CEED_GAUSS, &basis_ref));
     // Note - clang-tidy doesn't know basis_ref->contract must be valid here
-    CeedCheck(basis_ref && basis_ref->contract, ceed, CEED_ERROR_UNSUPPORTED, "Reference CPU ceed failed to create a tensor contraction object");
+    CeedCheck(basis_ref && basis_ref->contract, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
+              "Reference CPU ceed failed to create a tensor contraction object");
     CeedCall(CeedTensorContractReferenceCopy(basis_ref->contract, &basis->contract));
     CeedCall(CeedBasisDestroy(&basis_ref));
     CeedCall(CeedDestroy(&ceed_ref));
@@ -628,6 +629,7 @@ int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
   CeedCall(CeedMatrixMatrixMultiply(ceed, grad_1d, (const CeedScalar *)interp_1d_pinv, collo_grad_1d, Q_1d, Q_1d, P_1d));
 
   CeedCall(CeedFree(&interp_1d_pinv));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -669,6 +671,7 @@ int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_
   // Cleanup
   CeedCall(CeedFree(&C));
   CeedCall(CeedFree(&chebyshev_coeffs_1d_inv));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1328,6 +1331,7 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateTensorH1");
     CeedCall(CeedBasisCreateTensorH1(delegate, dim, num_comp, P_1d, Q_1d, interp_1d, grad_1d, q_ref_1d, q_weight_1d, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1467,6 +1471,7 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1");
     CeedCall(CeedBasisCreateH1(delegate, topo, num_comp, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1526,6 +1531,7 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Ceed
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateHdiv");
     CeedCall(CeedBasisCreateHdiv(delegate, topo, num_comp, num_nodes, num_qpts, interp, div, q_ref, q_weight, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1585,6 +1591,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateHcurl");
     CeedCall(CeedBasisCreateHcurl(delegate, topo, num_comp, num_nodes, num_qpts, interp, curl, q_ref, q_weight, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1681,6 +1688,7 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi
   // Cleanup
   CeedCall(CeedFree(&interp_project));
   CeedCall(CeedFree(&grad_project));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1806,9 +1814,7 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
 static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
   CeedInt  dim, num_comp, q_comp, num_nodes, num_qpts;
   CeedSize u_length = 0, v_length;
-  Ceed     ceed;
 
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
   CeedCall(CeedBasisGetDimension(basis, &dim));
   CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
   CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
@@ -1834,7 +1840,7 @@ static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransp
       has_good_dims = v_length >= (CeedSize)num_elem * (CeedSize)num_qpts;
       break;
   }
-  CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
+  CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1969,7 +1975,8 @@ int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *
   @ref Advanced
 **/
 int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed) {
-  *ceed = CeedBasisReturnCeed(basis);
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedBasisReturnCeed(basis), ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-cuda.c b/interface/ceed-cuda.c
index c4463b738d..ff28d10d14 100644
--- a/interface/ceed-cuda.c
+++ b/interface/ceed-cuda.c
@@ -23,10 +23,7 @@
 **/
 int CeedQFunctionSetCUDAUserFunction(CeedQFunction qf, CUfunction f) {
   if (!qf->SetCUDAUserFunction) {
-    Ceed ceed;
-
-    CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-    CeedDebug(ceed, "Backend does not support CUfunction pointers for QFunctions.");
+    CeedDebug(CeedQFunctionReturnCeed(qf), "Backend does not support CUfunction pointers for QFunctions.");
   } else {
     CeedCall(qf->SetCUDAUserFunction(qf, f));
   }
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 7d832f1a97..50028e6934 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -164,13 +164,12 @@ int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points) {
 **/
 int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible) {
   CeedInt num_elem_a, num_elem_b, num_points_a, num_points_b;
-  Ceed    ceed;
-
-  CeedCall(CeedElemRestrictionGetCeed(rstr_a, &ceed));
 
   // Cannot compare non-points restrictions
-  CeedCheck(rstr_a->rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_UNSUPPORTED, "First CeedElemRestriction must be AtPoints");
-  CeedCheck(rstr_b->rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_UNSUPPORTED, "Second CeedElemRestriction must be AtPoints");
+  CeedCheck(rstr_a->rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr_a), CEED_ERROR_UNSUPPORTED,
+            "First CeedElemRestriction must be AtPoints");
+  CeedCheck(rstr_b->rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr_a), CEED_ERROR_UNSUPPORTED,
+            "Second CeedElemRestriction must be AtPoints");
 
   CeedCall(CeedElemRestrictionGetNumElements(rstr_a, &num_elem_a));
   CeedCall(CeedElemRestrictionGetNumElements(rstr_b, &num_elem_b));
@@ -357,14 +356,13 @@ int CeedElemRestrictionRestoreCurlOrientations(CeedElemRestriction rstr, const C
 int CeedElemRestrictionGetLLayout(CeedElemRestriction rstr, CeedInt layout[3]) {
   bool                has_backend_strides;
   CeedRestrictionType rstr_type;
-  Ceed                ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
-  CeedCheck(rstr_type == CEED_RESTRICTION_STRIDED, ceed, CEED_ERROR_MINOR, "Only strided CeedElemRestriction have strided L-vector layout");
+  CeedCheck(rstr_type == CEED_RESTRICTION_STRIDED, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR,
+            "Only strided CeedElemRestriction have strided L-vector layout");
   CeedCall(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
   if (has_backend_strides) {
-    CeedCheck(rstr->l_layout[0], ceed, CEED_ERROR_MINOR, "CeedElemRestriction has no L-vector layout data");
+    CeedCheck(rstr->l_layout[0], CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR, "CeedElemRestriction has no L-vector layout data");
     for (CeedInt i = 0; i < 3; i++) layout[i] = rstr->l_layout[i];
   } else {
     CeedCall(CeedElemRestrictionGetStrides(rstr, layout));
@@ -481,12 +479,11 @@ int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedIn
 **/
 int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize e_size) {
   CeedRestrictionType rstr_type;
-  Ceed                ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
-  CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_INCOMPATIBLE, "Can only compute offset for a points CeedElemRestriction");
-  CeedCheck(e_size >= rstr->e_size, ceed, CEED_ERROR_INCOMPATIBLE,
+  CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
+            "Can only compute offset for a points CeedElemRestriction");
+  CeedCheck(e_size >= rstr->e_size, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
             "Can only increase the size of the E-vector for the CeedElemRestriction."
             " Current size: %" CeedSize_FMT " New size: %" CeedSize_FMT,
             rstr->e_size, e_size);
@@ -636,6 +633,7 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, Ce
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreate");
     CeedCall(CeedElemRestrictionCreate(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -693,6 +691,7 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateOriented");
     CeedCall(
         CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -751,6 +750,7 @@ int CeedElemRestrictionCreateCurlOriented(Ceed ceed, CeedInt num_elem, CeedInt e
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateCurlOriented");
     CeedCall(CeedElemRestrictionCreateCurlOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets,
                                                    curl_orients, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -803,6 +803,7 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_s
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateStrided");
     CeedCall(CeedElemRestrictionCreateStrided(delegate, num_elem, elem_size, num_comp, l_size, strides, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -870,6 +871,7 @@ int CeedElemRestrictionCreateAtPoints(Ceed ceed, CeedInt num_elem, CeedInt num_p
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateAtPoints");
     CeedCall(CeedElemRestrictionCreateAtPoints(delegate, num_elem, num_points, num_comp, l_size, mem_type, copy_mode, offsets, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -933,6 +935,7 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_s
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlocked");
     CeedCall(CeedElemRestrictionCreateBlocked(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets,
                                               rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1002,6 +1005,7 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedOriented");
     CeedCall(CeedElemRestrictionCreateBlockedOriented(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type, copy_mode,
                                                       offsets, orients, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1074,6 +1078,7 @@ int CeedElemRestrictionCreateBlockedCurlOriented(Ceed ceed, CeedInt num_elem, Ce
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedCurlOriented");
     CeedCall(CeedElemRestrictionCreateBlockedCurlOriented(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type,
                                                           copy_mode, offsets, curl_orients, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1135,6 +1140,7 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedStrided");
     CeedCall(CeedElemRestrictionCreateBlockedStrided(delegate, num_elem, elem_size, block_size, num_comp, l_size, strides, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1269,6 +1275,7 @@ int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec,
   CeedCall(CeedElemRestrictionGetEVectorSize(rstr, &e_size));
   if (l_vec) CeedCall(CeedVectorCreate(ceed, l_size, l_vec));
   if (e_vec) CeedCall(CeedVectorCreate(ceed, e_size, e_vec));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1289,9 +1296,7 @@ int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec,
 int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, CeedRequest *request) {
   CeedSize min_u_len, min_ru_len, len;
   CeedInt  num_elem;
-  Ceed     ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
   if (t_mode == CEED_NOTRANSPOSE) {
     CeedCall(CeedElemRestrictionGetEVectorSize(rstr, &min_ru_len));
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_u_len));
@@ -1300,11 +1305,11 @@ int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode,
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_ru_len));
   }
   CeedCall(CeedVectorGetLength(u, &len));
-  CeedCheck(min_u_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_u_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_ru_len,
             min_u_len);
   CeedCall(CeedVectorGetLength(ru, &len));
-  CeedCheck(min_ru_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_ru_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_u_len,
             min_ru_len);
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
@@ -1331,10 +1336,9 @@ int CeedElemRestrictionApplyAtPointsInElement(CeedElemRestriction rstr, CeedInt
                                               CeedRequest *request) {
   CeedSize min_u_len, min_ru_len, len;
   CeedInt  num_elem;
-  Ceed     ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
-  CeedCheck(rstr->ApplyAtPointsInElement, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionApplyAtPointsInElement");
+  CeedCheck(rstr->ApplyAtPointsInElement, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
+            "Backend does not implement CeedElemRestrictionApplyAtPointsInElement");
 
   if (t_mode == CEED_NOTRANSPOSE) {
     CeedInt num_points, num_comp;
@@ -1352,17 +1356,17 @@ int CeedElemRestrictionApplyAtPointsInElement(CeedElemRestriction rstr, CeedInt
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_ru_len));
   }
   CeedCall(CeedVectorGetLength(u, &len));
-  CeedCheck(min_u_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_u_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT
             ") for element %" CeedInt_FMT,
             len, min_ru_len, min_u_len, elem);
   CeedCall(CeedVectorGetLength(ru, &len));
-  CeedCheck(min_ru_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_ru_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT
             ") for element %" CeedInt_FMT,
             len, min_ru_len, min_u_len, elem);
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCheck(elem < num_elem, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(elem < num_elem, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Cannot retrieve element %" CeedInt_FMT ", element %" CeedInt_FMT " > total elements %" CeedInt_FMT "", elem, elem, num_elem);
   if (num_elem > 0) CeedCall(rstr->ApplyAtPointsInElement(rstr, elem, t_mode, u, ru, request));
   return CEED_ERROR_SUCCESS;
@@ -1387,10 +1391,9 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT
                                   CeedRequest *request) {
   CeedSize min_u_len, min_ru_len, len;
   CeedInt  block_size, num_elem;
-  Ceed     ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
-  CeedCheck(rstr->ApplyBlock, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionApplyBlock");
+  CeedCheck(rstr->ApplyBlock, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
+            "Backend does not implement CeedElemRestrictionApplyBlock");
 
   CeedCall(CeedElemRestrictionGetBlockSize(rstr, &block_size));
   if (t_mode == CEED_NOTRANSPOSE) {
@@ -1409,15 +1412,15 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT
     min_u_len = (CeedSize)block_size * (CeedSize)elem_size * (CeedSize)num_comp;
   }
   CeedCall(CeedVectorGetLength(u, &len));
-  CeedCheck(min_u_len == len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_u_len == len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_u_len,
             min_ru_len);
   CeedCall(CeedVectorGetLength(ru, &len));
-  CeedCheck(min_ru_len == len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_ru_len == len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_ru_len,
             min_u_len);
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCheck(block_size * block <= num_elem, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(block_size * block <= num_elem, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Cannot retrieve block %" CeedInt_FMT ", element %" CeedInt_FMT " > total elements %" CeedInt_FMT "", block, block_size * block,
             num_elem);
   CeedCall(rstr->ApplyBlock(rstr, block, t_mode, u, ru, request));
@@ -1435,7 +1438,8 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT
   @ref Advanced
 **/
 int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) {
-  *ceed = CeedElemRestrictionReturnCeed(rstr);
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedElemRestrictionReturnCeed(rstr), ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-hip.c b/interface/ceed-hip.c
index cc4a625853..f14df51eb5 100644
--- a/interface/ceed-hip.c
+++ b/interface/ceed-hip.c
@@ -23,10 +23,7 @@
 **/
 int CeedQFunctionSetHIPUserFunction(CeedQFunction qf, hipFunction_t f) {
   if (!qf->SetHIPUserFunction) {
-    Ceed ceed;
-
-    CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-    CeedDebug(ceed, "Backend does not support hipFunction_t pointers for QFunctions.");
+    CeedDebug(CeedQFunctionReturnCeed(qf), "Backend does not support hipFunction_t pointers for QFunctions.");
   } else {
     CeedCall(qf->SetHIPUserFunction(qf, f));
   }
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 301d5584fa..0bdc136ca1 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -195,10 +195,8 @@ int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) {
 int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis) {
   bool               is_composite;
   CeedInt            num_input_fields, num_output_fields;
-  Ceed               ceed;
   CeedOperatorField *op_input_fields, *op_output_fields;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
 
@@ -213,13 +211,14 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C
           CeedBasis basis;
 
           CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCheck(!*active_input_basis || *active_input_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active input CeedBases found");
+          CeedCheck(!*active_input_basis || *active_input_basis == basis, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active input CeedBases found");
           if (!*active_input_basis) CeedCall(CeedBasisReferenceCopy(basis, active_input_basis));
           CeedCall(CeedBasisDestroy(&basis));
         }
         CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_input_basis, ceed, CEED_ERROR_INCOMPLETE, "No active input CeedBasis found");
+      CeedCheck(*active_input_basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active input CeedBasis found");
     }
   }
   if (active_output_basis) {
@@ -233,13 +232,14 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C
           CeedBasis basis;
 
           CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCheck(!*active_output_basis || *active_output_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active output CeedBases found");
+          CeedCheck(!*active_output_basis || *active_output_basis == basis, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active output CeedBases found");
           if (!*active_output_basis) CeedCall(CeedBasisReferenceCopy(basis, active_output_basis));
           CeedCall(CeedBasisDestroy(&basis));
         }
         CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_output_basis, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedBasis found");
+      CeedCheck(*active_output_basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active output CeedBasis found");
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -278,10 +278,8 @@ int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *a
 int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *active_input_rstr, CeedElemRestriction *active_output_rstr) {
   bool               is_composite;
   CeedInt            num_input_fields, num_output_fields;
-  Ceed               ceed;
   CeedOperatorField *op_input_fields, *op_output_fields;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
 
@@ -296,13 +294,14 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
           CeedElemRestriction rstr;
 
           CeedCall(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
-          CeedCheck(!*active_input_rstr || *active_input_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active input CeedElemRestrictions found");
+          CeedCheck(!*active_input_rstr || *active_input_rstr == rstr, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active input CeedElemRestrictions found");
           if (!*active_input_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_input_rstr));
           CeedCall(CeedElemRestrictionDestroy(&rstr));
         }
         CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_input_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active input CeedElemRestriction found");
+      CeedCheck(*active_input_rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active input CeedElemRestriction found");
     }
   }
   if (active_output_rstr) {
@@ -316,13 +315,14 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
           CeedElemRestriction rstr;
 
           CeedCall(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
-          CeedCheck(!*active_output_rstr || *active_output_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active output CeedElemRestrictions found");
+          CeedCheck(!*active_output_rstr || *active_output_rstr == rstr, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active output CeedElemRestrictions found");
           if (!*active_output_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_output_rstr));
           CeedCall(CeedElemRestrictionDestroy(&rstr));
         }
         CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_output_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedElemRestriction found");
+      CeedCheck(*active_output_rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active output CeedElemRestriction found");
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -345,10 +345,8 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
 **/
 static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) {
   bool is_composite = false;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-  CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label");
+  CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label");
 
   // Check if field_label and op correspond
   if (field_label->from_op) {
@@ -357,7 +355,7 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
     for (CeedInt i = 0; i < op->num_context_labels; i++) {
       if (op->context_labels[i] == field_label) index = i;
     }
-    CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
+    CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
   }
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -367,7 +365,8 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
 
     CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
     CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created");
+    CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
       CeedQFunction        qf;
@@ -386,7 +385,7 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
 
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
+    CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label, field_type, values));
   }
   CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op, true));
@@ -412,10 +411,8 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
 static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, size_t *num_values,
                                              void *values) {
   bool is_composite = false;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-  CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label");
+  CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label");
 
   *(void **)values = NULL;
   *num_values      = 0;
@@ -427,7 +424,7 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
     for (CeedInt i = 0; i < op->num_context_labels; i++) {
       if (op->context_labels[i] == field_label) index = i;
     }
-    CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
+    CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
   }
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -437,7 +434,8 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
 
     CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
     CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created");
+    CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
       CeedQFunction        qf;
@@ -457,7 +455,7 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
 
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
+    CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label, field_type, num_values, values));
   }
   return CEED_ERROR_SUCCESS;
@@ -480,10 +478,8 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
 **/
 static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) {
   bool is_composite = false;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-  CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label");
+  CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label");
 
   // Check if field_label and op correspond
   if (field_label->from_op) {
@@ -492,7 +488,7 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
     for (CeedInt i = 0; i < op->num_context_labels; i++) {
       if (op->context_labels[i] == field_label) index = i;
     }
-    CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
+    CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
   }
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -502,7 +498,8 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
 
     CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
     CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created");
+    CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
       CeedQFunction        qf;
@@ -522,7 +519,7 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
 
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
+    CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label, field_type, values));
   }
   return CEED_ERROR_SUCCESS;
@@ -750,6 +747,7 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreate");
     CeedCall(CeedOperatorCreate(delegate, qf, dqf, dqfT, op));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -792,6 +790,7 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreateAtPoints");
     CeedCall(CeedOperatorCreateAtPoints(delegate, qf, dqf, dqfT, op));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -829,6 +828,7 @@ int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op) {
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
     if (delegate) {
       CeedCall(CeedCompositeOperatorCreate(delegate, op));
+      CeedCall(CeedDestroy(&delegate));
       return CEED_ERROR_SUCCESS;
     }
   }
@@ -893,38 +893,38 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) {
 int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector vec) {
   bool               is_input = true, is_at_points, is_composite, is_immutable;
   CeedInt            num_elem = 0, num_qpts = 0, num_input_fields, num_output_fields;
-  Ceed               ceed;
   CeedQFunction      qf;
   CeedQFunctionField qf_field, *qf_input_fields, *qf_output_fields;
   CeedOperatorField *op_field;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorIsImmutable(op, &is_immutable));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator.");
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
-  CeedCheck(rstr, ceed, CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction rstr for field \"%s\" must be non-NULL.", field_name);
-  CeedCheck(basis, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis basis for field \"%s\" must be non-NULL.", field_name);
-  CeedCheck(vec, ceed, CEED_ERROR_INCOMPATIBLE, "CeedVector vec for field \"%s\" must be non-NULL.", field_name);
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator.");
+  CeedCheck(!is_immutable, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
+  CeedCheck(rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction rstr for field \"%s\" must be non-NULL.", field_name);
+  CeedCheck(basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedBasis basis for field \"%s\" must be non-NULL.", field_name);
+  CeedCheck(vec, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedVector vec for field \"%s\" must be non-NULL.", field_name);
 
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || num_elem == op->num_elem, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || num_elem == op->num_elem, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION,
             "CeedElemRestriction with %" CeedInt_FMT " elements incompatible with prior %" CeedInt_FMT " elements", num_elem, op->num_elem);
   {
     CeedRestrictionType rstr_type;
 
     CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
     if (rstr_type == CEED_RESTRICTION_POINTS) {
-      CeedCheck(is_at_points, ceed, CEED_ERROR_UNSUPPORTED, "CeedElemRestriction AtPoints not supported for standard operator fields");
-      CeedCheck(basis == CEED_BASIS_NONE, ceed, CEED_ERROR_UNSUPPORTED, "CeedElemRestriction AtPoints must be used with CEED_BASIS_NONE");
+      CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+                "CeedElemRestriction AtPoints not supported for standard operator fields");
+      CeedCheck(basis == CEED_BASIS_NONE, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+                "CeedElemRestriction AtPoints must be used with CEED_BASIS_NONE");
       if (!op->first_points_rstr) {
         CeedCall(CeedElemRestrictionReferenceCopy(rstr, &op->first_points_rstr));
       } else {
         bool are_compatible;
 
         CeedCall(CeedElemRestrictionAtPointsAreCompatible(op->first_points_rstr, rstr, &are_compatible));
-        CeedCheck(are_compatible, ceed, CEED_ERROR_INCOMPATIBLE,
+        CeedCheck(are_compatible, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
                   "CeedElemRestriction must have compatible offsets with previously set CeedElemRestriction");
       }
     }
@@ -932,7 +932,7 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri
 
   if (basis == CEED_BASIS_NONE) CeedCall(CeedElemRestrictionGetElementSize(rstr, &num_qpts));
   else CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-  CeedCheck(op->num_qpts == 0 || num_qpts == op->num_qpts, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(op->num_qpts == 0 || num_qpts == op->num_qpts, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION,
             "%s must correspond to the same number of quadrature points as previously added CeedBases. Found %" CeedInt_FMT
             " quadrature points but expected %" CeedInt_FMT " quadrature points.",
             basis == CEED_BASIS_NONE ? "CeedElemRestriction" : "CeedBasis", num_qpts, op->num_qpts);
@@ -961,10 +961,10 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri
     }
   }
   // LCOV_EXCL_START
-  return CeedError(ceed, CEED_ERROR_INCOMPLETE, "CeedQFunction has no knowledge of field '%s'", field_name);
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "CeedQFunction has no knowledge of field '%s'", field_name);
   // LCOV_EXCL_STOP
 found:
-  CeedCall(CeedOperatorCheckField(ceed, qf_field, rstr, basis));
+  CeedCall(CeedOperatorCheckField(CeedOperatorReturnCeed(op), qf_field, rstr, basis));
   CeedCall(CeedCalloc(1, op_field));
 
   if (vec == CEED_VECTOR_ACTIVE) {
@@ -973,11 +973,11 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
     if (is_input) {
       if (op->input_size == -1) op->input_size = l_size;
-      CeedCheck(l_size == op->input_size, ceed, CEED_ERROR_INCOMPATIBLE,
+      CeedCheck(l_size == op->input_size, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
                 "LVector size %" CeedSize_FMT " does not match previous size %" CeedSize_FMT "", l_size, op->input_size);
     } else {
       if (op->output_size == -1) op->output_size = l_size;
-      CeedCheck(l_size == op->output_size, ceed, CEED_ERROR_INCOMPATIBLE,
+      CeedCheck(l_size == op->output_size, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
                 "LVector size %" CeedSize_FMT " does not match previous size %" CeedSize_FMT "", l_size, op->output_size);
     }
   }
@@ -1041,13 +1041,11 @@ int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperat
 **/
 int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_points, CeedVector point_coords) {
   bool is_at_points, is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorIsImmutable(op, &is_immutable));
-  CeedCheck(is_at_points, ceed, CEED_ERROR_MINOR, "Only defined for operator at points");
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
+  CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for operator at points");
+  CeedCheck(!is_immutable, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
 
   if (!op->first_points_rstr) {
     CeedCall(CeedElemRestrictionReferenceCopy(rstr_points, &op->first_points_rstr));
@@ -1055,7 +1053,7 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin
     bool are_compatible;
 
     CeedCall(CeedElemRestrictionAtPointsAreCompatible(op->first_points_rstr, rstr_points, &are_compatible));
-    CeedCheck(are_compatible, ceed, CEED_ERROR_INCOMPATIBLE,
+    CeedCheck(are_compatible, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
               "CeedElemRestriction must have compatible offsets with previously set field CeedElemRestriction");
   }
 
@@ -1249,13 +1247,12 @@ int CeedOperatorFieldGetData(CeedOperatorField op_field, const char **field_name
  */
 int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op) {
   bool is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(composite_op, &ceed));
-  CeedCheck(composite_op->is_composite, ceed, CEED_ERROR_MINOR, "CeedOperator is not a composite operator");
-  CeedCheck(composite_op->num_suboperators < CEED_COMPOSITE_MAX, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add additional sub-operators");
+  CeedCheck(composite_op->is_composite, CeedOperatorReturnCeed(composite_op), CEED_ERROR_MINOR, "CeedOperator is not a composite operator");
+  CeedCheck(composite_op->num_suboperators < CEED_COMPOSITE_MAX, CeedOperatorReturnCeed(composite_op), CEED_ERROR_UNSUPPORTED,
+            "Cannot add additional sub-operators");
   CeedCall(CeedOperatorIsImmutable(composite_op, &is_immutable));
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
+  CeedCheck(!is_immutable, CeedOperatorReturnCeed(composite_op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
 
   {
     CeedSize input_size, output_size;
@@ -1264,8 +1261,8 @@ int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op)
     if (composite_op->input_size == -1) composite_op->input_size = input_size;
     if (composite_op->output_size == -1) composite_op->output_size = output_size;
     // Note, a size of -1 means no active vector restriction set, so no incompatibility
-    CeedCheck((input_size == -1 || input_size == composite_op->input_size) && (output_size == -1 || output_size == composite_op->output_size), ceed,
-              CEED_ERROR_MAJOR,
+    CeedCheck((input_size == -1 || input_size == composite_op->input_size) && (output_size == -1 || output_size == composite_op->output_size),
+              CeedOperatorReturnCeed(composite_op), CEED_ERROR_MAJOR,
               "Sub-operators must have compatible dimensions; composite operator of shape (%" CeedSize_FMT ", %" CeedSize_FMT
               ") not compatible with sub-operator of "
               "shape (%" CeedSize_FMT ", %" CeedSize_FMT ")",
@@ -1361,12 +1358,10 @@ int CeedCompositeOperatorGetSubByName(CeedOperator op, const char *op_name, Ceed
 **/
 int CeedOperatorCheckReady(CeedOperator op) {
   bool          is_at_points, is_composite;
-  Ceed          ceed;
   CeedQFunction qf = NULL;
 
   if (op->is_interface_setup) return CEED_ERROR_SUCCESS;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   if (!is_composite) CeedCall(CeedOperatorGetQFunction(op, &qf));
@@ -1393,11 +1388,12 @@ int CeedOperatorCheckReady(CeedOperator op) {
   } else {
     CeedInt num_input_fields, num_output_fields;
 
-    CeedCheck(op->num_fields > 0, ceed, CEED_ERROR_INCOMPLETE, "No operator fields set");
+    CeedCheck(op->num_fields > 0, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No operator fields set");
     CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, NULL, &num_output_fields, NULL));
-    CeedCheck(op->num_fields == num_input_fields + num_output_fields, ceed, CEED_ERROR_INCOMPLETE, "Not all operator fields set");
-    CeedCheck(op->has_restriction, ceed, CEED_ERROR_INCOMPLETE, "At least one restriction required");
-    CeedCheck(op->num_qpts > 0 || is_at_points, ceed, CEED_ERROR_INCOMPLETE,
+    CeedCheck(op->num_fields == num_input_fields + num_output_fields, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE,
+              "Not all operator fields set");
+    CeedCheck(op->has_restriction, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "At least one restriction required");
+    CeedCheck(op->num_qpts > 0 || is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE,
               "At least one non-collocated CeedBasis is required or the number of quadrature points must be set");
   }
 
@@ -1612,7 +1608,8 @@ int CeedOperatorViewTerse(CeedOperator op, FILE *stream) {
   @ref Advanced
 **/
 int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) {
-  *ceed = CeedOperatorReturnCeed(op);
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op), ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 3e82b38493..937d7fde0d 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -37,15 +37,13 @@
 static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, CeedQFunction *qf_fallback) {
   char               *source_path_with_name = NULL;
   CeedInt             num_input_fields, num_output_fields;
-  Ceed                ceed;
   CeedQFunctionField *input_fields, *output_fields;
 
   // Check if NULL qf passed in
   if (!qf) return CEED_ERROR_SUCCESS;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-  CeedDebug256(ceed, 1, "---------- CeedOperator Fallback ----------\n");
-  CeedDebug(ceed, "Creating fallback CeedQFunction\n");
+  CeedDebug256(CeedQFunctionReturnCeed(qf), 1, "---------- CeedOperator Fallback ----------\n");
+  CeedDebug(CeedQFunctionReturnCeed(qf), "Creating fallback CeedQFunction\n");
 
   if (qf->source_path) {
     size_t path_len = strlen(qf->source_path), name_len = strlen(qf->kernel_name);
@@ -115,10 +113,11 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
   // Fallback Ceed
   CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
+  CeedCall(CeedDestroy(&ceed));
   if (!ceed_fallback) return CEED_ERROR_SUCCESS;
 
-  CeedDebug256(ceed, 1, "---------- CeedOperator Fallback ----------\n");
-  CeedDebug(ceed, "Creating fallback CeedOperator\n");
+  CeedDebug256(CeedOperatorReturnCeed(op), 1, "---------- CeedOperator Fallback ----------\n");
+  CeedDebug(CeedOperatorReturnCeed(op), "Creating fallback CeedOperator\n");
 
   // Clone Op
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -186,6 +185,7 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
   //       The op holds the only reference to op_fallback and is responsible for deleting itself and op_fallback.
   op->op_fallback                 = op_fallback;
   op_fallback->op_fallback_parent = op;
+  CeedCall(CeedDestroy(&ceed_fallback));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -203,12 +203,10 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
 **/
 static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                                    CeedVector assembled) {
-  Ceed ceed;
   bool is_composite;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   // Assemble QFunction
   CeedInt             layout_qf[3];
@@ -265,7 +263,7 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator
         continue;
       }  // No matching output basis found
     }
-    CeedCheck(active_elem_rstrs_in[b_in] == active_elem_rstrs_out[b_out], ceed, CEED_ERROR_UNSUPPORTED,
+    CeedCheck(active_elem_rstrs_in[b_in] == active_elem_rstrs_out[b_out], CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Cannot assemble operator diagonal with different input and output active element restrictions");
 
     // Assemble point block diagonal restriction, if needed
@@ -390,12 +388,10 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator
 **/
 static inline int CeedSingleOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                               CeedVector assembled) {
-  Ceed ceed;
   bool is_at_points;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
-  CeedCheck(!is_at_points, ceed, CEED_ERROR_UNSUPPORTED, "AtPoints operator not supported");
+  CeedCheck(!is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "AtPoints operator not supported");
   CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(op, request, is_point_block, assembled));
   return CEED_ERROR_SUCCESS;
 }
@@ -454,9 +450,9 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   CeedVector          index_vec_in, index_vec_out, elem_dof_in, elem_dof_out;
   CeedElemRestriction elem_rstr_in, elem_rstr_out, index_elem_rstr_in, index_elem_rstr_out;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
+  CeedCall(CeedOperatorGetCeed(op, &ceed));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &num_nodes_in, &num_nodes_out));
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &elem_rstr_in, &elem_rstr_out));
@@ -539,6 +535,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   }
   CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in));
   CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -556,12 +553,10 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   @ref Developer
 **/
 static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) {
-  Ceed ceed;
   bool is_composite;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   // Early exit for empty operator
   {
@@ -615,9 +610,10 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
   CeedCall(CeedOperatorAssemblyDataGetEvalModes(data, &num_active_bases_in, &num_eval_modes_in, &eval_modes_in, NULL, &num_active_bases_out,
                                                 &num_eval_modes_out, &eval_modes_out, NULL, NULL));
 
-  CeedCheck(num_active_bases_in == 1 && num_active_bases_out == 1, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(num_active_bases_in == 1 && num_active_bases_out == 1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
             "Cannot assemble operator with multiple active bases");
-  CeedCheck(num_eval_modes_in[0] > 0 && num_eval_modes_out[0] > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
+  CeedCheck(num_eval_modes_in[0] > 0 && num_eval_modes_out[0] > 0, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+            "Cannot assemble operator without inputs/outputs");
 
   CeedCall(CeedOperatorAssemblyDataGetBases(data, NULL, &active_bases_in, &B_mats_in, NULL, &active_bases_out, &B_mats_out));
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &elem_rstr_in, &elem_rstr_out));
@@ -641,7 +637,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
 
   if (elem_rstr_in != elem_rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(elem_rstr_out, &num_elem_out));
-    CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
+    CeedCheck(num_elem_in == num_elem_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Active input and output operator restrictions must have the same number of elements."
               " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
               num_elem_in, num_elem_out);
@@ -649,7 +645,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
     CeedCall(CeedElemRestrictionGetNumComponents(elem_rstr_out, &num_comp_out));
     if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
     else CeedCall(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
-    CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED,
+    CeedCheck(num_qpts_in == num_qpts_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Active input and output bases must have the same number of quadrature points."
               " Input has %" CeedInt_FMT " points; output has %" CeedInt_FMT "points.",
               num_qpts_in, num_qpts_out);
@@ -709,7 +705,11 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
           CeedCall(CeedTensorContractApply(contract, 1, num_qpts_in * num_eval_modes_in[0], elem_size_in, elem_size_out, BTD_mat, CEED_NOTRANSPOSE,
                                            false, B_mat_in, elem_mat));
         } else {
+          Ceed ceed;
+
+          CeedCall(CeedOperatorGetCeed(op, &ceed));
           CeedCall(CeedMatrixMatrixMultiply(ceed, BTD_mat, B_mat_in, elem_mat, elem_size_out, elem_size_in, num_qpts_in * num_eval_modes_in[0]));
+          CeedCall(CeedDestroy(&ceed));
         }
 
         // Transform the element matrix if required
@@ -768,7 +768,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
       }
     }
   }
-  CeedCheck(count == local_num_entries, ceed, CEED_ERROR_MAJOR, "Error computing entries");
+  CeedCheck(count == local_num_entries, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Error computing entries");
   CeedCall(CeedVectorRestoreArray(values, &vals));
 
   // Cleanup
@@ -807,12 +807,10 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
 static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num_entries) {
   bool                is_composite;
   CeedInt             num_elem_in, elem_size_in, num_comp_in, num_elem_out, elem_size_out, num_comp_out;
-  Ceed                ceed;
   CeedElemRestriction rstr_in, rstr_out;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
   CeedCall(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in));
@@ -820,7 +818,7 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num
   CeedCall(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
   if (rstr_in != rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out));
-    CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
+    CeedCheck(num_elem_in == num_elem_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Active input and output operator restrictions must have the same number of elements."
               " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
               num_elem_in, num_elem_out);
@@ -1034,6 +1032,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
   CeedCall(CeedOperatorCheckReady(*op_coarse));
 
   // Cleanup
+  CeedCall(CeedDestroy(&ceed));
   CeedCall(CeedVectorDestroy(&mult_vec));
   CeedCall(CeedElemRestrictionDestroy(&rstr_fine));
   CeedCall(CeedElemRestrictionDestroy(&rstr_p_mult_fine));
@@ -1162,6 +1161,7 @@ int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, Ceed
 
   // Cleanup
   CeedCall(CeedElemRestrictionRestoreOffsets(rstr, &offsets));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1811,7 +1811,9 @@ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) {
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n");
       CeedDebug(ceed, "Falling back from %s operator at address %p to %s operator at address %p\n", resource, op, resource_fallback, op->op_fallback);
+      CeedCall(CeedDestroy(&ceed_fallback));
     }
+    CeedCall(CeedDestroy(&ceed));
   }
   *op_fallback = op->op_fallback;
   return CEED_ERROR_SUCCESS;
@@ -1843,7 +1845,9 @@ int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent) {
   @ref Backend
 **/
 int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent) {
-  *parent = op->op_fallback_parent ? op->op_fallback_parent->ceed : op->ceed;
+  *parent = NULL;
+  if (op->op_fallback_parent) CeedCall(CeedReferenceCopy(op->op_fallback_parent->ceed, parent));
+  else CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op), parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1883,13 +1887,11 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled,
     CeedCall(op->LinearAssembleQFunction(op, assembled, rstr, request));
   } else {
     // Operator fallback
-    Ceed         ceed;
     CeedOperator op_fallback;
 
-    CeedCall(CeedOperatorGetCeed(op, &ceed));
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunction(op_fallback, assembled, rstr, request));
-    else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction");
+    else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction");
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1961,13 +1963,11 @@ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector
     CeedCall(CeedElemRestrictionDestroy(&assembled_rstr));
   } else {
     // Operator fallback
-    Ceed         ceed;
     CeedOperator op_fallback;
 
-    CeedCall(CeedOperatorGetCeed(op, &ceed));
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
-    else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate");
+    else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate");
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1992,14 +1992,12 @@ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector
 int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -2054,14 +2052,12 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce
 int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -2115,17 +2111,15 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
    @ref User
 **/
 int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols) {
-  Ceed          ceed;
   bool          is_composite;
   CeedInt       num_active_components, num_sub_operators;
   CeedOperator *sub_operators;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedSize input_size = 0, output_size = 0;
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   if (is_composite) {
     CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub_operators));
@@ -2155,10 +2149,10 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
         CeedInt comp_stride_sub, num_active_components_sub;
 
         CeedCall(CeedElemRestrictionGetCompStride(active_elem_rstrs[i], &comp_stride_sub));
-        CeedCheck(comp_stride == comp_stride_sub, ceed, CEED_ERROR_DIMENSION,
+        CeedCheck(comp_stride == comp_stride_sub, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION,
                   "Active element restrictions must have the same component stride: %d vs %d", comp_stride, comp_stride_sub);
         CeedCall(CeedElemRestrictionGetNumComponents(active_elem_rstrs[i], &num_active_components_sub));
-        CeedCheck(num_active_components == num_active_components_sub, ceed, CEED_ERROR_INCOMPATIBLE,
+        CeedCheck(num_active_components == num_active_components_sub, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
                   "All suboperators must have the same number of output components."
                   " Previous: %" CeedInt_FMT " Current: %" CeedInt_FMT,
                   num_active_components, num_active_components_sub);
@@ -2222,14 +2216,12 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
 int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -2286,14 +2278,12 @@ int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector ass
 int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -2494,19 +2484,19 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic
 
   CeedCall(CeedOperatorCheckReady(op));
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-
   // Zero mult vector
   CeedCall(CeedVectorSetValue(mult, 0.0));
 
   // Get suboperators
   CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
   if (num_suboperators == 0) return CEED_ERROR_SUCCESS;
+  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
 
   // Work vector
   CeedCall(CeedVectorGetLength(mult, &l_vec_len));
+  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedVectorCreate(ceed, l_vec_len, &ones_l_vec));
+  CeedCall(CeedDestroy(&ceed));
   CeedCall(CeedVectorSetValue(ones_l_vec, 1.0));
   CeedCall(CeedVectorGetArray(mult, CEED_MEM_HOST, &mult_array));
 
@@ -2642,6 +2632,7 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
 
   // Core code
   CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2705,6 +2696,7 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f
 
   // Core code
   CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2939,6 +2931,8 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   CeedCall(CeedOperatorSetField(*fdm_inv, "output", rstr, fdm_basis, CEED_VECTOR_ACTIVE));
 
   // Cleanup
+  CeedCall(CeedDestroy(&ceed));
+  CeedCall(CeedDestroy(&ceed_parent));
   CeedCall(CeedVectorDestroy(&q_data));
   CeedCall(CeedElemRestrictionDestroy(&rstr));
   CeedCall(CeedElemRestrictionDestroy(&rstr_qd_i));
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 7c0cf3a285..0daa4b98fc 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -252,6 +252,7 @@ int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path) {
     } else {
       CeedCall(CeedGetJitAbsolutePath(ceed, qf->user_source, &absolute_path));
     }
+    CeedCall(CeedDestroy(&ceed));
 
     size_t source_len = strlen(absolute_path) - kernel_name_len - 1;
 
@@ -295,6 +296,7 @@ int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer
 
     CeedCall(CeedQFunctionGetCeed(qf, &ceed));
     CeedCall(CeedLoadSourceToBuffer(ceed, source_path, &buffer));
+    CeedCall(CeedDestroy(&ceed));
     *source_buffer = buffer;
   }
   return CEED_ERROR_SUCCESS;
@@ -628,6 +630,7 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "QFunction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionCreateInterior");
     CeedCall(CeedQFunctionCreateInterior(delegate, vec_length, f, source, qf));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -770,18 +773,16 @@ int CeedQFunctionReferenceCopy(CeedQFunction qf, CeedQFunction *qf_copy) {
 **/
 int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) {
   bool is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
   CeedCall(CeedQFunctionIsImmutable(qf, &is_immutable));
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable");
-  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || size == 1, ceed, CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1");
+  CeedCheck(!is_immutable, CeedQFunctionReturnCeed(qf), CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable");
+  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || size == 1, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR,
+    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
               "CeedQFunction field names must be unique. Duplicate name: %s", field_name);
   }
   for (CeedInt i = 0; i < qf->num_output_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR,
+    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
               "CeedQFunction field names must be unique. Duplicate name: %s", field_name);
   }
   CeedCall(CeedQFunctionFieldSet(&qf->input_fields[qf->num_input_fields], field_name, size, eval_mode));
@@ -807,17 +808,18 @@ int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size
 **/
 int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) {
   bool is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
   CeedCall(CeedQFunctionIsImmutable(qf, &is_immutable));
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "CeedQFunction cannot be changed after set as immutable");
-  CeedCheck(eval_mode != CEED_EVAL_WEIGHT, ceed, CEED_ERROR_DIMENSION, "Cannot create CeedQFunction output with CEED_EVAL_WEIGHT");
+  CeedCheck(!is_immutable, CeedQFunctionReturnCeed(qf), CEED_ERROR_MAJOR, "CeedQFunction cannot be changed after set as immutable");
+  CeedCheck(eval_mode != CEED_EVAL_WEIGHT, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION,
+            "Cannot create CeedQFunction output with CEED_EVAL_WEIGHT");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique");
   }
   for (CeedInt i = 0; i < qf->num_output_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique");
   }
   CeedCall(CeedQFunctionFieldSet(&qf->output_fields[qf->num_output_fields], field_name, size, eval_mode));
   qf->num_output_fields++;
@@ -1009,7 +1011,8 @@ int CeedQFunctionView(CeedQFunction qf, FILE *stream) {
   @ref Advanced
 **/
 int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) {
-  *ceed = CeedQFunctionReturnCeed(qf);
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedQFunctionReturnCeed(qf), ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1040,13 +1043,11 @@ Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return qf->ceed; }
 **/
 int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v) {
   CeedInt vec_length;
-  Ceed    ceed;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-  CeedCheck(qf->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionApply");
+  CeedCheck(qf->Apply, CeedQFunctionReturnCeed(qf), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionApply");
   CeedCall(CeedQFunctionGetVectorLength(qf, &vec_length));
-  CeedCheck(Q % vec_length == 0, ceed, CEED_ERROR_DIMENSION, "Number of quadrature points %" CeedInt_FMT " must be a multiple of %" CeedInt_FMT, Q,
-            qf->vec_length);
+  CeedCheck(Q % vec_length == 0, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION,
+            "Number of quadrature points %" CeedInt_FMT " must be a multiple of %" CeedInt_FMT, Q, qf->vec_length);
   CeedCall(CeedQFunctionSetImmutable(qf));
   CeedCall(qf->Apply(qf, Q, u, v));
   return CEED_ERROR_SUCCESS;
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index 16cc22cebe..a32dee73a9 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -59,12 +59,11 @@ int CeedQFunctionContextRegisterGeneric(CeedQFunctionContext ctx, const char *fi
                                         CeedContextFieldType field_type, size_t num_values) {
   size_t  field_size  = 0;
   CeedInt field_index = -1;
-  Ceed    ceed;
 
   // Check for duplicate
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCall(CeedQFunctionContextGetFieldIndex(ctx, field_name, &field_index));
-  CeedCheck(field_index == -1, ceed, CEED_ERROR_UNSUPPORTED, "QFunctionContext field with name \"%s\" already registered", field_name);
+  CeedCheck(field_index == -1, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED,
+            "QFunctionContext field with name \"%s\" already registered", field_name);
 
   // Allocate space for field data
   if (ctx->num_fields == 0) {
@@ -147,7 +146,8 @@ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) {
   @ref Backend
 **/
 int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) {
-  *ceed = CeedQFunctionContextReturnCeed(ctx);
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedQFunctionContextReturnCeed(ctx), ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -571,6 +571,7 @@ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) {
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Context"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionContextCreate");
     CeedCall(CeedQFunctionContextCreate(delegate, ctx));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -620,11 +621,9 @@ int CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, CeedQFunctionCon
   @ref User
 **/
 int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, size_t size, void *data) {
-  Ceed ceed;
-
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCheck(ctx->SetData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextSetData");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->SetData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextSetData");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
 
   CeedCall(CeedQFunctionContextDestroyData(ctx));
   ctx->ctx_size = size;
@@ -650,17 +649,16 @@ int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type,
 int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
   void *temp_data      = NULL;
   bool  has_valid_data = true, has_borrowed_data_of_type = true;
-  Ceed  ceed;
 
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data));
-  CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to take, must set data");
+  CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to take, must set data");
 
-  CeedCheck(ctx->TakeData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextTakeData");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->TakeData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextTakeData");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
 
   CeedCall(CeedQFunctionContextHasBorrowedDataOfType(ctx, mem_type, &has_borrowed_data_of_type));
-  CeedCheck(has_borrowed_data_of_type, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_borrowed_data_of_type, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND,
             "CeedQFunctionContext has no borrowed %s data, must set data with CeedQFunctionContextSetData", CeedMemTypes[mem_type]);
 
   CeedCall(ctx->TakeData(ctx, mem_type, &temp_data));
@@ -687,15 +685,15 @@ int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type,
 **/
 int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
   bool has_valid_data = true;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCheck(ctx->GetData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetData");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
-  CeedCheck(ctx->num_readers == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, a process has read access");
+  CeedCheck(ctx->GetData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetData");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->num_readers == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, a process has read access");
 
   CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data));
-  CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
+  CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
 
   CeedCall(ctx->GetData(ctx, mem_type, data));
   ctx->state++;
@@ -721,14 +719,14 @@ int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type,
 **/
 int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
   bool has_valid_data = true;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCheck(ctx->GetDataRead, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetDataRead");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->GetDataRead, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED,
+            "Backend does not support CeedQFunctionContextGetDataRead");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
 
   CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data));
-  CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
+  CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
 
   CeedCall(ctx->GetDataRead(ctx, mem_type, data));
   ctx->num_readers++;
diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c
index d5b5578589..dd8b81a118 100644
--- a/interface/ceed-tensor.c
+++ b/interface/ceed-tensor.c
@@ -36,6 +36,7 @@ int CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract) {
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "TensorContract"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedTensorContractCreate");
     CeedCall(CeedTensorContractCreate(delegate, contract));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -123,7 +124,8 @@ int CeedTensorContractStridedApply(CeedTensorContract contract, CeedInt A, CeedI
   @ref Backend
 **/
 int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed) {
-  *ceed = CeedTensorContractReturnCeed(contract);
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedTensorContractReturnCeed(contract), ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index b0e277e9d2..839946762b 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -165,6 +165,7 @@ int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) {
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
     CeedCall(CeedVectorCreate(delegate, length, vec));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -210,17 +211,21 @@ int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy) {
   @ref User
 **/
 int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
-  Ceed        ceed;
   CeedMemType mem_type, mem_type_copy;
   CeedScalar *array;
 
-  // Get the preferred memory type
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
+  // Get the preferred memory types
+  {
+    Ceed ceed;
+
+    CeedCall(CeedVectorGetCeed(vec, &ceed));
+    CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
+    CeedCall(CeedDestroy(&ceed));
 
-  // Get the preferred memory type
-  CeedCall(CeedVectorGetCeed(vec_copy, &ceed));
-  CeedCall(CeedGetPreferredMemType(ceed, &mem_type_copy));
+    CeedCall(CeedVectorGetCeed(vec_copy, &ceed));
+    CeedCall(CeedGetPreferredMemType(ceed, &mem_type_copy));
+    CeedCall(CeedDestroy(&ceed));
+  }
 
   // Check that both have same memory type
   if (mem_type != mem_type_copy) mem_type = CEED_MEM_HOST;
@@ -231,7 +236,7 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
 
     CeedCall(CeedVectorGetLength(vec, &length_vec));
     CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
-    CeedCheck(length_vec == length_copy, ceed, CEED_ERROR_INCOMPATIBLE, "CeedVectors must have the same length to copy");
+    CeedCheck(length_vec == length_copy, CeedVectorReturnCeed(vec), CEED_ERROR_INCOMPATIBLE, "CeedVectors must have the same length to copy");
   }
 
   // Copy the values from vec to vec_copy
@@ -304,13 +309,11 @@ int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVect
 **/
 int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array) {
   CeedSize length;
-  Ceed     ceed;
-
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
 
-  CeedCheck(vec->SetArray, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorSetArray");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->SetArray, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support VectorSetArray");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) CeedCall(vec->SetArray(vec, mem_type, copy_mode, array));
@@ -329,11 +332,9 @@ int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_m
   @ref User
 **/
 int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
-  Ceed ceed;
-
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   if (vec->SetValue) {
     CeedCall(vec->SetValue(vec, value));
@@ -365,11 +366,9 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
   @ref User
 **/
 int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, CeedScalar value) {
-  Ceed ceed;
-
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   if (vec->SetValueStrided) {
     CeedCall(vec->SetValueStrided(vec, start, step, value));
@@ -438,22 +437,20 @@ int CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type) {
 int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedSize    length;
   CeedScalar *temp_array = NULL;
-  Ceed        ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, a process has read access");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot take CeedVector array, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot take CeedVector array, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
     bool has_borrowed_array_of_type = true, has_valid_array = true;
 
     CeedCall(CeedVectorHasBorrowedArrayOfType(vec, mem_type, &has_borrowed_array_of_type));
-    CeedCheck(has_borrowed_array_of_type, ceed, CEED_ERROR_BACKEND, "CeedVector has no borrowed %s array, must set array with CeedVectorSetArray",
-              CeedMemTypes[mem_type]);
+    CeedCheck(has_borrowed_array_of_type, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
+              "CeedVector has no borrowed %s array, must set array with CeedVectorSetArray", CeedMemTypes[mem_type]);
 
     CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-    CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+    CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
               "CeedVector has no valid data to take, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
     CeedCall(vec->TakeArray(vec, mem_type, &temp_array));
@@ -481,19 +478,18 @@ int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array
 **/
 int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedSize length;
-  Ceed     ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->GetArray, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArray");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->GetArray, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support GetArray");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
     bool has_valid_array = true;
 
     CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-    CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+    CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
               "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
     CeedCall(vec->GetArray(vec, mem_type, array));
@@ -520,18 +516,17 @@ int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array)
 **/
 int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) {
   CeedSize length;
-  Ceed     ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->GetArrayRead, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayRead");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector read-only array access, the access lock is already in use");
+  CeedCheck(vec->GetArrayRead, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayRead");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector read-only array access, the access lock is already in use");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
     bool has_valid_array = true;
 
     CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-    CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+    CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
               "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
     CeedCall(vec->GetArrayRead(vec, mem_type, array));
@@ -558,12 +553,11 @@ int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScala
 **/
 int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedSize length;
-  Ceed     ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->GetArrayWrite, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedVectorGetArrayWrite");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->GetArrayWrite, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedVectorGetArrayWrite");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
@@ -730,27 +724,36 @@ int CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x) {
   CeedSize          length_x, length_y;
   CeedScalar       *y_array = NULL;
   CeedScalar const *x_array = NULL;
-  Ceed              ceed, ceed_parent_x, ceed_parent_y;
 
-  CeedCall(CeedVectorGetCeed(y, &ceed));
   CeedCall(CeedVectorGetLength(y, &length_y));
   CeedCall(CeedVectorGetLength(x, &length_x));
-  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(length_x == length_y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED,
             "Cannot add vector of different lengths."
             " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
             length_x, length_y);
-  CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY");
+  CeedCheck(x != y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY");
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
-  CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_x, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
   CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y));
-  CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_y, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
-  CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));
-  CeedCall(CeedGetParent(y->ceed, &ceed_parent_y));
-  CeedCheck(ceed_parent_x == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE, "Vectors x and y must be created by the same Ceed context");
+  {
+    Ceed ceed_x, ceed_y, ceed_parent_x, ceed_parent_y;
+
+    CeedCall(CeedVectorGetCeed(y, &ceed_y));
+    CeedCall(CeedVectorGetCeed(x, &ceed_x));
+    CeedCall(CeedGetParent(ceed_x, &ceed_parent_x));
+    CeedCall(CeedGetParent(ceed_y, &ceed_parent_y));
+    CeedCall(CeedDestroy(&ceed_x));
+    CeedCall(CeedDestroy(&ceed_y));
+    CeedCheck(ceed_parent_x == ceed_parent_y, CeedVectorReturnCeed(y), CEED_ERROR_INCOMPATIBLE,
+              "Vectors x and y must be created by the same Ceed context");
+    CeedCall(CeedDestroy(&ceed_parent_x));
+    CeedCall(CeedDestroy(&ceed_parent_y));
+  }
 
   // Return early for empty vectors
   if (length_y == 0) return CEED_ERROR_SUCCESS;
@@ -792,28 +795,36 @@ int CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector
   CeedSize          length_x, length_y;
   CeedScalar       *y_array = NULL;
   CeedScalar const *x_array = NULL;
-  Ceed              ceed, ceed_parent_x, ceed_parent_y;
-
-  CeedCall(CeedVectorGetCeed(y, &ceed));
 
   CeedCall(CeedVectorGetLength(y, &length_y));
   CeedCall(CeedVectorGetLength(x, &length_x));
-  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(length_x == length_y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED,
             "Cannot add vector of different lengths."
             " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
             length_x, length_y);
-  CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPBY");
+  CeedCheck(x != y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPBY");
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
-  CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_x, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
   CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y));
-  CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_y, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
-  CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));
-  CeedCall(CeedGetParent(y->ceed, &ceed_parent_y));
-  CeedCheck(ceed_parent_x == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE, "Vectors x and y must be created by the same Ceed context");
+  {
+    Ceed ceed_x, ceed_y, ceed_parent_x, ceed_parent_y;
+
+    CeedCall(CeedVectorGetCeed(y, &ceed_y));
+    CeedCall(CeedVectorGetCeed(x, &ceed_x));
+    CeedCall(CeedGetParent(ceed_x, &ceed_parent_x));
+    CeedCall(CeedGetParent(ceed_y, &ceed_parent_y));
+    CeedCall(CeedDestroy(&ceed_x));
+    CeedCall(CeedDestroy(&ceed_y));
+    CeedCheck(ceed_parent_x == ceed_parent_y, CeedVectorReturnCeed(y), CEED_ERROR_INCOMPATIBLE,
+              "Vectors x and y must be created by the same Ceed context");
+    CeedCall(CeedDestroy(&ceed_parent_x));
+    CeedCall(CeedDestroy(&ceed_parent_y));
+  }
 
   // Return early for empty vectors
   if (length_y == 0) return CEED_ERROR_SUCCESS;
@@ -856,28 +867,39 @@ int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y) {
   CeedScalar       *w_array = NULL;
   CeedScalar const *x_array = NULL, *y_array = NULL;
   CeedSize          length_w, length_x, length_y;
-  Ceed              ceed, ceed_parent_w, ceed_parent_x, ceed_parent_y;
 
-  CeedCall(CeedVectorGetCeed(w, &ceed));
   CeedCall(CeedVectorGetLength(w, &length_w));
   CeedCall(CeedVectorGetLength(x, &length_x));
   CeedCall(CeedVectorGetLength(y, &length_y));
-  CeedCheck(length_x >= length_w && length_y >= length_w, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(length_x >= length_w && length_y >= length_w, CeedVectorReturnCeed(w), CEED_ERROR_UNSUPPORTED,
             "Cannot pointwise multiply vectors of incompatible lengths."
             " w length: %" CeedSize_FMT " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
             length_w, length_x, length_y);
 
-  CeedCall(CeedGetParent(w->ceed, &ceed_parent_w));
-  CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));
-  CeedCall(CeedGetParent(y->ceed, &ceed_parent_y));
-  CeedCheck(ceed_parent_w == ceed_parent_x && ceed_parent_w == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE,
-            "Vectors w, x, and y must be created by the same Ceed context");
+  {
+    Ceed ceed_w, ceed_x, ceed_y, ceed_parent_w, ceed_parent_x, ceed_parent_y;
+
+    CeedCall(CeedVectorGetCeed(w, &ceed_w));
+    CeedCall(CeedVectorGetCeed(x, &ceed_x));
+    CeedCall(CeedVectorGetCeed(y, &ceed_y));
+    CeedCall(CeedGetParent(ceed_w, &ceed_parent_w));
+    CeedCall(CeedGetParent(ceed_x, &ceed_parent_x));
+    CeedCall(CeedGetParent(ceed_y, &ceed_parent_y));
+    CeedCall(CeedDestroy(&ceed_w));
+    CeedCall(CeedDestroy(&ceed_x));
+    CeedCall(CeedDestroy(&ceed_y));
+    CeedCheck(ceed_parent_w == ceed_parent_x && ceed_parent_w == ceed_parent_y, CeedVectorReturnCeed(w), CEED_ERROR_INCOMPATIBLE,
+              "Vectors w, x, and y must be created by the same Ceed context");
+    CeedCall(CeedDestroy(&ceed_parent_w));
+    CeedCall(CeedDestroy(&ceed_parent_x));
+    CeedCall(CeedDestroy(&ceed_parent_y));
+  }
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
-  CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_x, CeedVectorReturnCeed(w), CEED_ERROR_BACKEND,
             "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
   CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y));
-  CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_y, CeedVectorReturnCeed(w), CEED_ERROR_BACKEND,
             "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
   // Return early for empty vectors
@@ -933,15 +955,13 @@ int CeedVectorReciprocal(CeedVector vec) {
   bool        has_valid_array = true;
   CeedSize    length;
   CeedScalar *array;
-  Ceed        ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
   CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-  CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
             "CeedVector has no valid data to compute reciprocal, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
   // Check if vector data set
-  CeedCheck(vec->state > 0, ceed, CEED_ERROR_INCOMPLETE, "CeedVector must have data set to take reciprocal");
+  CeedCheck(vec->state > 0, CeedVectorReturnCeed(vec), CEED_ERROR_INCOMPLETE, "CeedVector must have data set to take reciprocal");
 
   // Return early for empty vector
   CeedCall(CeedVectorGetLength(vec, &length));
@@ -1032,7 +1052,8 @@ int CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream) {
   @ref Advanced
 **/
 int CeedVectorGetCeed(CeedVector vec, Ceed *ceed) {
-  *ceed = CeedVectorReturnCeed(vec);
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedVectorReturnCeed(vec), ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed.c b/interface/ceed.c
index 5579cca242..a25e97a6b7 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -517,7 +517,8 @@ int CeedGetParent(Ceed ceed, Ceed *parent) {
     CeedCall(CeedGetParent(ceed->parent, parent));
     return CEED_ERROR_SUCCESS;
   }
-  *parent = ceed;
+  *parent = NULL;
+  CeedCall(CeedReferenceCopy(ceed, parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -532,7 +533,8 @@ int CeedGetParent(Ceed ceed, Ceed *parent) {
   @ref Backend
 **/
 int CeedGetDelegate(Ceed ceed, Ceed *delegate) {
-  *delegate = ceed->delegate;
+  *delegate = NULL;
+  if (ceed->delegate) CeedCall(CeedReferenceCopy(ceed->delegate, delegate));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -550,7 +552,7 @@ int CeedGetDelegate(Ceed ceed, Ceed *delegate) {
   @ref Backend
 **/
 int CeedSetDelegate(Ceed ceed, Ceed delegate) {
-  ceed->delegate   = delegate;
+  CeedCall(CeedReferenceCopy(delegate, &ceed->delegate));
   delegate->parent = ceed;
   return CEED_ERROR_SUCCESS;
 }
@@ -570,7 +572,8 @@ int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name) {
   // Check for object delegate
   for (CeedInt i = 0; i < ceed->obj_delegate_count; i++) {
     if (!strcmp(obj_name, ceed->obj_delegates->obj_name)) {
-      *delegate = ceed->obj_delegates->delegate;
+      *delegate = NULL;
+      CeedCall(CeedReferenceCopy(ceed->obj_delegates->delegate, delegate));
       return CEED_ERROR_SUCCESS;
     }
   }
@@ -607,7 +610,7 @@ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) {
   ceed->obj_delegate_count++;
 
   // Set object delegate
-  ceed->obj_delegates[count].delegate = delegate;
+  CeedCall(CeedReferenceCopy(delegate, &ceed->obj_delegates[count].delegate));
   CeedCall(CeedStringAllocCopy(obj_name, &ceed->obj_delegates[count].obj_name));
 
   // Set delegate parent
@@ -679,7 +682,8 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
       CeedCall(CeedRestoreJitDefines(ceed, &jit_defines));
     }
   }
-  *fallback_ceed = ceed->op_fallback_ceed;
+  *fallback_ceed = NULL;
+  if (ceed->op_fallback_ceed) CeedCall(CeedReferenceCopy(ceed->op_fallback_ceed, fallback_ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -903,6 +907,7 @@ int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***ji
   *num_source_roots = ceed_parent->num_jit_source_roots;
   *jit_source_roots = (const char **)ceed_parent->jit_source_roots;
   ceed_parent->num_jit_source_roots_readers++;
+  CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -922,6 +927,7 @@ int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots) {
   CeedCall(CeedGetParent(ceed, &ceed_parent));
   *jit_source_roots = NULL;
   ceed_parent->num_jit_source_roots_readers--;
+  CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -945,6 +951,7 @@ int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines
   *num_defines = ceed_parent->num_jit_defines;
   *jit_defines = (const char **)ceed_parent->jit_defines;
   ceed_parent->num_jit_defines_readers++;
+  CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -964,6 +971,7 @@ int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines) {
   CeedCall(CeedGetParent(ceed, &ceed_parent));
   *jit_defines = NULL;
   ceed_parent->num_jit_defines_readers--;
+  CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1251,6 +1259,7 @@ int CeedSetStream(Ceed ceed, void *handle) {
 
     if (delegate) CeedCall(CeedSetStream(delegate, handle));
     else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support setting stream");
+    CeedCall(CeedDestroy(&delegate));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1314,6 +1323,7 @@ int CeedGetPreferredMemType(Ceed ceed, CeedMemType *mem_type) {
     } else {
       *mem_type = CEED_MEM_HOST;
     }
+    CeedCall(CeedDestroy(&delegate));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1360,6 +1370,7 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) {
   CeedCall(CeedCalloc(path_length + 1, &ceed_parent->jit_source_roots[index]));
   memcpy(ceed_parent->jit_source_roots[index], jit_source_root, path_length);
   ceed_parent->num_jit_source_roots++;
+  CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1390,6 +1401,7 @@ int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
   CeedCall(CeedCalloc(define_length + 1, &ceed_parent->jit_defines[index]));
   memcpy(ceed_parent->jit_defines[index], jit_define, define_length);
   ceed_parent->num_jit_defines++;
+  CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/tests/t409-qfunction.c b/tests/t409-qfunction.c
index 44029cc797..5f17a5614a 100644
--- a/tests/t409-qfunction.c
+++ b/tests/t409-qfunction.c
@@ -74,6 +74,7 @@ int main(int argc, char **argv) {
   CeedQFunctionContextRestoreData(ctx, &ctx_data_new);
   is_writable = false;
   CeedQFunctionSetContextWritable(qf, is_writable);
+
   {
     in[0]  = u;
     out[0] = v;

From c11e12f4f794f6cb6040b1c3c2be08186369c43d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Oct 2024 11:36:08 -0600
Subject: [PATCH 219/571] qf - require CeedOpGetQF to be Destroyed

---
 backends/blocked/ceed-blocked-operator.c      | 21 ++++++++++--------
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |  1 +
 backends/cuda-gen/ceed-cuda-gen-operator.c    | 22 +++++++++----------
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 11 +++++++++-
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |  1 +
 backends/hip-gen/ceed-hip-gen-operator.c      | 20 ++++++++---------
 backends/hip-ref/ceed-hip-ref-operator.c      | 11 +++++++++-
 backends/opt/ceed-opt-operator.c              | 18 +++++++++------
 backends/ref/ceed-ref-operator.c              | 22 +++++++++++++------
 .../ceed-sycl-gen-operator-build.sycl.cpp     |  1 +
 .../sycl-gen/ceed-sycl-gen-operator.sycl.cpp  | 22 +++++++++----------
 .../sycl-ref/ceed-sycl-ref-operator.sycl.cpp  | 11 +++++++---
 interface/ceed-operator.c                     | 17 +++++++++++++-
 interface/ceed-preconditioning.c              |  2 ++
 14 files changed, 119 insertions(+), 61 deletions(-)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 2f7b0e6dad..11951e19ba 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -261,6 +261,7 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -438,23 +439,23 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
   CeedOperatorField    *op_input_fields, *op_output_fields;
   CeedOperator_Blocked *impl;
 
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
-
   // Setup
   CeedCallBackend(CeedOperatorSetup_Blocked(op));
 
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+
   // Restriction only operator
   if (impl->is_identity_rstr_op) {
     CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[0], CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request));
     CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[1], CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request));
     return CEED_ERROR_SUCCESS;
   }
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request));
@@ -514,6 +515,7 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -708,10 +710,11 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, true, e_data_full, impl));
 
   // Output blocked restriction
-  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorRestoreArray(l_vec, &l_vec_array));
   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
   CeedCallBackend(CeedElemRestrictionApply(block_rstr, CEED_TRANSPOSE, l_vec, *assembled, request));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 3103bcbfb7..96277aa8cf 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -903,6 +903,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 7ea1002b21..d999dc8caa 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -103,15 +103,6 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Cuda_gen  *data;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedGetData(ceed, &cuda_data));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
   // Check for tensor-product bases
   {
     bool has_tensor_bases;
@@ -121,14 +112,22 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
     if (!has_tensor_bases) {
       CeedOperator op_fallback;
 
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases");
-      CeedCallBackend(CeedDestroy(&ceed));
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
     }
   }
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &cuda_data));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Creation of the operator
   CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op));
 
@@ -253,6 +252,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index ac7b1cad03..fe873c74b5 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -354,6 +354,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
     }
   }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -601,6 +602,7 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
   // Return work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -739,6 +741,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
     }
   }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -927,6 +930,7 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   // Restore work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1068,9 +1072,10 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   }
 
   // Restore output
+  CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedDestroy(&ceed_parent));
-  CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1274,6 +1279,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1360,6 +1366,7 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1664,6 +1671,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2041,6 +2049,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 6a13cd5600..0819519512 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -910,6 +910,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index a61d1df32f..11ee4943b8 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -40,14 +40,6 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedOperatorField     *op_input_fields, *op_output_fields;
   CeedOperator_Hip_gen  *data;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
   // Check for tensor-product bases
   {
     bool has_tensor_bases;
@@ -57,14 +49,21 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     if (!has_tensor_bases) {
       CeedOperator op_fallback;
 
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to non-tensor bases");
-      CeedCallBackend(CeedDestroy(&ceed));
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to non-tensor bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
     }
   }
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Creation of the operator
   CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op));
 
@@ -179,6 +178,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 04733482ee..6f1119084b 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -353,6 +353,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
     }
   }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -599,6 +600,7 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
   // Return work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -737,6 +739,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
     }
   }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -924,6 +927,7 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   // Restore work vector
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1065,9 +1069,10 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   }
 
   // Restore output
+  CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedDestroy(&ceed_parent));
-  CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1271,6 +1276,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1357,6 +1363,7 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1661,6 +1668,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2038,6 +2046,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
   CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 69f4560a06..679e88163b 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -267,6 +267,7 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -449,21 +450,17 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Opt   *impl;
 
+  // Setup
+  CeedCallBackend(CeedOperatorSetup_Opt(op));
+
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   const CeedInt block_size = ceed_impl->block_size;
   const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
 
-  // Setup
-  CeedCallBackend(CeedOperatorSetup_Opt(op));
-
   // Restriction only operator
   if (impl->is_identity_rstr_op) {
     for (CeedInt b = 0; b < num_blocks; b++) {
@@ -473,6 +470,11 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, in_vec, e_data, impl, request));
 
@@ -506,6 +508,7 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -718,6 +721,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index fa461f899a..b53a8c6150 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -201,6 +201,7 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -383,16 +384,12 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Ref   *impl;
 
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
   // Setup
   CeedCallBackend(CeedOperatorSetup_Ref(op));
 
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+
   // Restriction only operator
   if (impl->is_identity_rstr_op) {
     CeedElemRestriction elem_rstr;
@@ -406,6 +403,11 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request));
 
@@ -466,6 +468,7 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -646,6 +649,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
   CeedCallBackend(CeedDestroy(&ceed_parent));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -873,6 +877,7 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1065,6 +1070,7 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
   // Cleanup point coordinates
   CeedCallBackend(CeedVectorDestroy(&point_coords));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1289,6 +1295,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&point_coords));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1541,6 +1548,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
   CeedCallBackend(CeedVectorDestroy(&out_vec));
   CeedCallBackend(CeedVectorDestroy(&point_coords));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index f1fb58e42f..b3850c05fc 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -782,6 +782,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index eb74176bbf..1acd7147ee 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -39,15 +39,6 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Sycl_gen  *impl;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
   // Check for tensor-product bases
   {
     bool has_tensor_bases;
@@ -57,14 +48,22 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     if (!has_tensor_bases) {
       CeedOperator op_fallback;
 
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to sycl/ref CeedOperator due to non-tensor bases");
-      CeedCallBackend(CeedDestroy(&ceed));
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to sycl/ref CeedOperator due to non-tensor bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
     }
   }
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Creation of the operator
   CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op));
 
@@ -200,6 +199,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_impl->d_c));
   CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index e843015df1..89148678d3 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -249,6 +249,7 @@ static int CeedOperatorSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -467,6 +468,7 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -607,6 +609,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
   CeedCallBackend(CeedDestroy(&ceed_parent));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -812,13 +815,14 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedElemRestrictionReferenceCopy(rstr_out, &diag->diag_rstr));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
 
+  // Wait for all copies to complete and handle exceptions
+  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
+
   // Cleanup
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
-
-  // Wait for all copies to complete and handle exceptions
-  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1175,6 +1179,7 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   CeedCallBackend(CeedBasisDestroy(&basis_in));
   CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 0bdc136ca1..3d6f6dd004 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -148,6 +148,7 @@ int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) {
   CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedCall(CeedQFunctionDestroy(&qf));
 
   fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", pre, num_elem, num_qpts);
   fprintf(stream, "%s  %" CeedInt_FMT " field%s\n", pre, total_fields, total_fields > 1 ? "s" : "");
@@ -374,6 +375,7 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
 
       CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
       CeedCall(CeedQFunctionGetContext(qf, &ctx));
+      CeedCall(CeedQFunctionDestroy(&qf));
       // Try every sub-operator, ok if some sub-operators do not have field
       if (field_label->sub_labels[i] && ctx) {
         CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label->sub_labels[i], field_type, values));
@@ -385,6 +387,7 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
 
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
+    CeedCall(CeedQFunctionDestroy(&qf));
     CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label, field_type, values));
   }
@@ -443,6 +446,7 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
 
       CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
       CeedCall(CeedQFunctionGetContext(qf, &ctx));
+      CeedCall(CeedQFunctionDestroy(&qf));
       // Try every sub-operator, ok if some sub-operators do not have field
       if (field_label->sub_labels[i] && ctx) {
         CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label->sub_labels[i], field_type, num_values, values));
@@ -455,6 +459,7 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
 
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
+    CeedCall(CeedQFunctionDestroy(&qf));
     CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label, field_type, num_values, values));
   }
@@ -507,6 +512,7 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
 
       CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
       CeedCall(CeedQFunctionGetContext(qf, &ctx));
+      CeedCall(CeedQFunctionDestroy(&qf));
       // Try every sub-operator, ok if some sub-operators do not have field
       if (field_label->sub_labels[i] && ctx) {
         CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label->sub_labels[i], field_type, values));
@@ -519,6 +525,7 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
 
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
+    CeedCall(CeedQFunctionDestroy(&qf));
     CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label, field_type, values));
   }
@@ -640,7 +647,8 @@ int CeedOperatorGetQFunction(CeedOperator op, CeedQFunction *qf) {
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Not defined for composite operator");
-  *qf = op->qf;
+  *qf = NULL;
+  CeedCall(CeedQFunctionReferenceCopy(op->qf, qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -939,6 +947,7 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri
 
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields));
+  CeedCall(CeedQFunctionDestroy(&qf));
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char *qf_field_name;
 
@@ -1021,6 +1030,7 @@ int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperat
 
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetFields(qf, num_input_fields, NULL, num_output_fields, NULL));
+  CeedCall(CeedQFunctionDestroy(&qf));
   if (input_fields) *input_fields = op->input_fields;
   if (output_fields) *output_fields = op->output_fields;
   return CEED_ERROR_SUCCESS;
@@ -1400,6 +1410,7 @@ int CeedOperatorCheckReady(CeedOperator op) {
   // Flag as immutable and ready
   op->is_interface_setup = true;
   if (qf && qf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(qf));
+  CeedCall(CeedQFunctionDestroy(&qf));
   if (op->dqf && op->dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(op->dqf));
   if (op->dqfT && op->dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(op->dqfT));
   return CEED_ERROR_SUCCESS;
@@ -1699,6 +1710,7 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
 
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields));
+    CeedCall(CeedQFunctionDestroy(&qf));
     CeedCall(CeedOperatorGetFields(op, NULL, &op_input_fields, NULL, &op_output_fields));
     CeedCall(CeedOperatorGetNumElements(op, &num_elem));
 
@@ -1734,6 +1746,7 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
       CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
       CeedCall(CeedOperatorGetQFunction(op, &qf));
       CeedCall(CeedQFunctionGetFlopsEstimate(qf, &qf_flops));
+      CeedCall(CeedQFunctionDestroy(&qf));
       CeedCheck(qf_flops > -1, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE,
                 "Must set CeedQFunction FLOPs estimate with CeedQFunctionSetUserFlopsEstimate");
       *flops += num_elem * num_qpts * qf_flops;
@@ -1790,6 +1803,7 @@ int CeedOperatorGetContext(CeedOperator op, CeedQFunctionContext *ctx) {
   CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "Cannot retrieve CeedQFunctionContext for composite operator");
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetInnerContext(qf, &qf_ctx));
+  CeedCall(CeedQFunctionDestroy(&qf));
   if (qf_ctx) CeedCall(CeedQFunctionContextReferenceCopy(qf_ctx, ctx));
   return CEED_ERROR_SUCCESS;
 }
@@ -1882,6 +1896,7 @@ int CeedOperatorGetContextFieldLabel(CeedOperator op, const char *field_name, Ce
     // Single, non-composite operator
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetInnerContext(qf, &ctx));
+    CeedCall(CeedQFunctionDestroy(&qf));
     if (ctx) {
       CeedCall(CeedQFunctionContextGetFieldLabel(ctx, field_name, field_label));
     } else {
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 937d7fde0d..5535b91661 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1535,6 +1535,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
     }
     CeedCall(CeedVectorDestroy(&vec));
   }
+  CeedCall(CeedQFunctionDestroy(&qf));
   (*data)->num_active_bases_in   = num_active_bases_in;
   (*data)->num_eval_modes_in     = num_eval_modes_in;
   (*data)->eval_modes_in         = eval_modes_in;
@@ -2938,6 +2939,7 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   CeedCall(CeedElemRestrictionDestroy(&rstr_qd_i));
   CeedCall(CeedBasisDestroy(&basis));
   CeedCall(CeedBasisDestroy(&fdm_basis));
+  CeedCall(CeedQFunctionDestroy(&qf));
   CeedCall(CeedQFunctionDestroy(&qf_fdm));
   return CEED_ERROR_SUCCESS;
 }

From 1485364c160e7dd269a03246437f5ec5e1f33a6b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Oct 2024 11:56:46 -0600
Subject: [PATCH 220/571] ctx - require CeedQFGetCtx to be Destroyed

---
 interface/ceed-fortran.c         |  2 ++
 interface/ceed-operator.c        | 45 +++++++++++++-------------------
 interface/ceed-preconditioning.c |  1 +
 interface/ceed-qfunction.c       |  8 ++++--
 tests/t525-operator.c            |  1 +
 5 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c
index d3ed061c3e..501d901c2e 100644
--- a/interface/ceed-fortran.c
+++ b/interface/ceed-fortran.c
@@ -845,6 +845,8 @@ CEED_EXTERN void fCeedQFunctionSetContext(int *qf, int *ctx, int *err) {
   if (*err) return;
   fctxdata->inner_ctx = ctx_;
   *err                = CeedQFunctionContextRestoreData(fctx, (void **)&fctxdata);
+  if (*err) return;
+  *err = CeedQFunctionContextDestroy(&fctx);
 }
 
 #define fCeedQFunctionView FORTRAN_NAME(ceedqfunctionview, CEEDQFUNCTIONVIEW)
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 3d6f6dd004..96fe26a834 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -370,26 +370,22 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
               "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
-      CeedQFunction        qf;
       CeedQFunctionContext ctx;
 
-      CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
-      CeedCall(CeedQFunctionGetContext(qf, &ctx));
-      CeedCall(CeedQFunctionDestroy(&qf));
+      CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx));
       // Try every sub-operator, ok if some sub-operators do not have field
-      if (field_label->sub_labels[i] && ctx) {
+      if (ctx && field_label->sub_labels[i]) {
         CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label->sub_labels[i], field_type, values));
       }
+      CeedCall(CeedQFunctionContextDestroy(&ctx));
     }
   } else {
-    CeedQFunction        qf;
     CeedQFunctionContext ctx;
 
-    CeedCall(CeedOperatorGetQFunction(op, &qf));
-    CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCall(CeedQFunctionDestroy(&qf));
+    CeedCall(CeedOperatorGetContext(op, &ctx));
     CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label, field_type, values));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op, true));
   return CEED_ERROR_SUCCESS;
@@ -441,27 +437,24 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
               "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
-      CeedQFunction        qf;
       CeedQFunctionContext ctx;
 
-      CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
-      CeedCall(CeedQFunctionGetContext(qf, &ctx));
-      CeedCall(CeedQFunctionDestroy(&qf));
+      CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx));
       // Try every sub-operator, ok if some sub-operators do not have field
-      if (field_label->sub_labels[i] && ctx) {
+      if (ctx && field_label->sub_labels[i]) {
         CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label->sub_labels[i], field_type, num_values, values));
+        CeedCall(CeedQFunctionContextDestroy(&ctx));
         return CEED_ERROR_SUCCESS;
       }
+      CeedCall(CeedQFunctionContextDestroy(&ctx));
     }
   } else {
-    CeedQFunction        qf;
     CeedQFunctionContext ctx;
 
-    CeedCall(CeedOperatorGetQFunction(op, &qf));
-    CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCall(CeedQFunctionDestroy(&qf));
+    CeedCall(CeedOperatorGetContext(op, &ctx));
     CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label, field_type, num_values, values));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -507,27 +500,24 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
               "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
-      CeedQFunction        qf;
       CeedQFunctionContext ctx;
 
-      CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
-      CeedCall(CeedQFunctionGetContext(qf, &ctx));
-      CeedCall(CeedQFunctionDestroy(&qf));
+      CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx));
       // Try every sub-operator, ok if some sub-operators do not have field
-      if (field_label->sub_labels[i] && ctx) {
+      if (ctx && field_label->sub_labels[i]) {
         CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label->sub_labels[i], field_type, values));
+        CeedCall(CeedQFunctionContextDestroy(&ctx));
         return CEED_ERROR_SUCCESS;
       }
+      CeedCall(CeedQFunctionContextDestroy(&ctx));
     }
   } else {
-    CeedQFunction        qf;
     CeedQFunctionContext ctx;
 
-    CeedCall(CeedOperatorGetQFunction(op, &qf));
-    CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCall(CeedQFunctionDestroy(&qf));
+    CeedCall(CeedOperatorGetContext(op, &ctx));
     CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label, field_type, values));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1804,6 +1794,7 @@ int CeedOperatorGetContext(CeedOperator op, CeedQFunctionContext *ctx) {
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetInnerContext(qf, &qf_ctx));
   CeedCall(CeedQFunctionDestroy(&qf));
+  *ctx = NULL;
   if (qf_ctx) CeedCall(CeedQFunctionContextReferenceCopy(qf_ctx, ctx));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 5535b91661..1cc767cb08 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -71,6 +71,7 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, Cee
 
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
     CeedCall(CeedQFunctionSetContext(*qf_fallback, ctx));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
   for (CeedInt i = 0; i < num_input_fields; i++) {
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 0daa4b98fc..6936974fa4 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -330,7 +330,8 @@ int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f) {
   @ref Backend
 **/
 int CeedQFunctionGetContext(CeedQFunction qf, CeedQFunctionContext *ctx) {
-  *ctx = qf->ctx;
+  *ctx = NULL;
+  if (qf->ctx) CeedCall(CeedQFunctionContextReferenceCopy(qf->ctx, ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -361,6 +362,7 @@ int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, void *da
   } else {
     *(void **)data = NULL;
   }
+  CeedCall(CeedQFunctionContextDestroy(&ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -387,7 +389,7 @@ int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data) {
       CeedCall(CeedQFunctionContextRestoreDataRead(ctx, data));
     }
   }
-  *(void **)data = NULL;
+  CeedCall(CeedQFunctionContextDestroy(&ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -415,6 +417,7 @@ int CeedQFunctionGetInnerContext(CeedQFunction qf, CeedQFunctionContext *ctx) {
   } else {
     *ctx = qf_ctx;
   }
+  CeedCall(CeedQFunctionContextDestroy(&qf_ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -730,6 +733,7 @@ int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, CeedEvalMode in_mode, C
   CeedCall(CeedQFunctionGetContext(*qf, &ctx));
   CeedCall(CeedQFunctionContextGetFieldLabel(ctx, "size", &size_label));
   CeedCall(CeedQFunctionContextSetInt32(ctx, size_label, &size));
+  CeedCall(CeedQFunctionContextDestroy(&ctx));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/tests/t525-operator.c b/tests/t525-operator.c
index 9d0d80442f..9617d94f32 100644
--- a/tests/t525-operator.c
+++ b/tests/t525-operator.c
@@ -112,6 +112,7 @@ int main(int argc, char **argv) {
 
     CeedOperatorGetContext(op_sub_1, &ctx_copy);
     if (ctx_copy != qf_ctx_sub_1) printf("Incorrect QFunctionContext retrieved");
+    CeedQFunctionContextDestroy(&ctx_copy);
 
     CeedOperatorGetContext(op_sub_2, &ctx_copy);  // Destroys reference to qf_ctx_sub_1
     if (ctx_copy != qf_ctx_sub_2) printf("Incorrect QFunctionContext retrieved");

From dc3318a4fa50e4a2bf558a69684a8f36486f60c8 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Oct 2024 12:23:53 -0600
Subject: [PATCH 221/571] doc - update release notes for new Destroy rqmts

---
 doc/sphinx/source/releasenotes.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 4e629b48e0..816187fd1f 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -11,10 +11,10 @@ On this page we provide a summary of the main API changes, new features and exam
 - Add `bool` field type for `CeedQFunctionContext` and related interfaces to use `bool` fields.
 - `CEED_BASIS_COLLOCATED` removed; users should only use `CEED_BASIS_NONE`.
 - Remove unneeded pointer for `CeedElemRestrictionGetELayout`.
-- Require use of `Ceed*Destroy()` on Ceed objects returned from `CeedOperatorFieldGet*()`;
 - Change QFunction source include file handling in JiT compilers
     - Add `CEED_RUNNING_JIT_PASS` compiler definition for wrapping header files that device JiT compilers cannot read
     - Users should now prefer `#include <ceed/types.h>` rather than `#include <ceed.h>` in QFunction source files
+- Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`.
 
 ### New features
 

From 3e551a327d6c97f9de071b988b42ffdb7bed19a7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Oct 2024 13:07:13 -0600
Subject: [PATCH 222/571] rust - update for destroy requirements

---
 rust/libceed/src/basis.rs            | 6 +-----
 rust/libceed/src/elem_restriction.rs | 6 +-----
 rust/libceed/src/operator.rs         | 6 +-----
 rust/libceed/src/qfunction.rs        | 8 +++-----
 rust/libceed/src/vector.rs           | 6 +-----
 5 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index b308076ab6..2d7cdba637 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -326,11 +326,7 @@ impl<'a> Basis<'a> {
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedBasisGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        unsafe { crate::check_error(bind_ceed::CeedBasisReturnCeed(self.ptr), ierr) }
     }
 
     /// Apply basis evaluation from nodes to quadrature points or vice versa
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index e800f56c1e..500de95e80 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -320,11 +320,7 @@ impl<'a> ElemRestriction<'a> {
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedElemRestrictionGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        unsafe { crate::check_error(bind_ceed::CeedElemRestrictionReturnCeed(self.ptr), ierr) }
     }
 
     /// Create an Lvector for an ElemRestriction
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 809c78f6e6..9f3506a493 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -442,11 +442,7 @@ impl<'a> OperatorCore<'a> {
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        unsafe { crate::check_error(bind_ceed::CeedOperatorReturnCeed(self.ptr), ierr) }
     }
 
     // Common implementations
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index 0d32d01d28..a09c0ea918 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -442,11 +442,7 @@ impl<'a> QFunctionCore<'a> {
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedQFunctionGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        unsafe { crate::check_error(bind_ceed::CeedQFunctionReturnCeed(self.ptr), ierr) }
     }
 
     // Common implementation
@@ -644,6 +640,8 @@ impl<'a> QFunction<'a> {
         ceed.check_error(ierr)?;
         ierr = unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) };
         ceed.check_error(ierr)?;
+        ierr = unsafe { bind_ceed::CeedQFunctionContextDestroy(&mut qf_ctx_ptr) };
+        ceed.check_error(ierr)?;
         Ok(Self {
             qf_core: QFunctionCore {
                 ptr,
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index c90d8a295a..9173070180 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -348,11 +348,7 @@ impl<'a> Vector<'a> {
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedVectorGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        unsafe { crate::check_error(bind_ceed::CeedVectorReturnCeed(self.ptr), ierr) }
     }
 
     /// Returns the length of a Vector

From 11544396610b36de1cb2f0d18032eefe5c670568 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Oct 2024 14:04:31 -0600
Subject: [PATCH 223/571] rust - tidier error checking

---
 rust/libceed/src/basis.rs            | 8 +++++++-
 rust/libceed/src/elem_restriction.rs | 8 +++++++-
 rust/libceed/src/lib.rs              | 7 +++++--
 rust/libceed/src/operator.rs         | 8 +++++++-
 rust/libceed/src/qfunction.rs        | 8 +++++++-
 rust/libceed/src/vector.rs           | 8 +++++++-
 6 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index 2d7cdba637..9b8ef714dd 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -323,10 +323,16 @@ impl<'a> Basis<'a> {
         })
     }
 
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedBasisReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        unsafe { crate::check_error(bind_ceed::CeedBasisReturnCeed(self.ptr), ierr) }
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     /// Apply basis evaluation from nodes to quadrature points or vice versa
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index 500de95e80..e2779e876c 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -317,10 +317,16 @@ impl<'a> ElemRestriction<'a> {
         })
     }
 
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedElemRestrictionReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        unsafe { crate::check_error(bind_ceed::CeedElemRestrictionReturnCeed(self.ptr), ierr) }
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     /// Create an Lvector for an ElemRestriction
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index bf7de98e07..36ad38a1c5 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -161,7 +161,10 @@ impl fmt::Display for Error {
 // Internal error checker
 // -----------------------------------------------------------------------------
 #[doc(hidden)]
-pub(crate) fn check_error(ceed_ptr: bind_ceed::Ceed, ierr: i32) -> Result<i32> {
+pub(crate) fn check_error<F>(ceed_ptr: F, ierr: i32) -> Result<i32>
+where
+    F: FnOnce() -> bind_ceed::Ceed,
+{
     // Return early if code is clean
     if ierr == bind_ceed::CeedErrorType_CEED_ERROR_SUCCESS {
         return Ok(ierr);
@@ -169,7 +172,7 @@ pub(crate) fn check_error(ceed_ptr: bind_ceed::Ceed, ierr: i32) -> Result<i32> {
     // Retrieve error message
     let mut ptr: *const std::os::raw::c_char = std::ptr::null_mut();
     let c_str = unsafe {
-        bind_ceed::CeedGetErrorMessage(ceed_ptr, &mut ptr);
+        bind_ceed::CeedGetErrorMessage(ceed_ptr(), &mut ptr);
         std::ffi::CStr::from_ptr(ptr)
     };
     let message = c_str.to_string_lossy().to_string();
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 9f3506a493..a1a5116ec2 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -439,10 +439,16 @@ impl<'a> fmt::Display for CompositeOperator<'a> {
 // Core functionality
 // -----------------------------------------------------------------------------
 impl<'a> OperatorCore<'a> {
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedOperatorReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        unsafe { crate::check_error(bind_ceed::CeedOperatorReturnCeed(self.ptr), ierr) }
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     // Common implementations
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index a09c0ea918..48c27e781a 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -439,10 +439,16 @@ impl<'a> fmt::Display for QFunctionByName<'a> {
 // Core functionality
 // -----------------------------------------------------------------------------
 impl<'a> QFunctionCore<'a> {
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedQFunctionReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        unsafe { crate::check_error(bind_ceed::CeedQFunctionReturnCeed(self.ptr), ierr) }
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     // Common implementation
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index 9173070180..0ffe84dfac 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -345,10 +345,16 @@ impl<'a> Vector<'a> {
         Ok(x)
     }
 
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedVectorReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        unsafe { crate::check_error(bind_ceed::CeedVectorReturnCeed(self.ptr), ierr) }
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     /// Returns the length of a Vector

From 656ef1e5dc8a79430517f8559593f57085006221 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Oct 2024 15:27:06 -0600
Subject: [PATCH 224/571] rust - directly pass ierr to check_error()

---
 rust/libceed/src/basis.rs            |  36 +++++-----
 rust/libceed/src/elem_restriction.rs |  47 ++++++------
 rust/libceed/src/lib.rs              |   4 +-
 rust/libceed/src/operator.rs         | 103 +++++++++++----------------
 rust/libceed/src/qfunction.rs        |  56 ++++++---------
 rust/libceed/src/vector.rs           |  67 +++++++----------
 6 files changed, 130 insertions(+), 183 deletions(-)

diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index 9b8ef714dd..a3188f7bfb 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -152,7 +152,7 @@ impl<'a> Basis<'a> {
             i32::try_from(P1d).unwrap(),
             i32::try_from(Q1d).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateTensorH1(
                 ceed.ptr,
                 dim,
@@ -165,8 +165,7 @@ impl<'a> Basis<'a> {
                 qweight1d.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -196,10 +195,9 @@ impl<'a> Basis<'a> {
             i32::try_from(Q).unwrap(),
             qmode as bind_ceed::CeedQuadMode,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateTensorH1Lagrange(ceed.ptr, dim, ncomp, P, Q, qmode, &mut ptr)
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -224,7 +222,7 @@ impl<'a> Basis<'a> {
             i32::try_from(nnodes).unwrap(),
             i32::try_from(nqpts).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateH1(
                 ceed.ptr,
                 topo,
@@ -237,8 +235,7 @@ impl<'a> Basis<'a> {
                 qweight.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -263,7 +260,7 @@ impl<'a> Basis<'a> {
             i32::try_from(nnodes).unwrap(),
             i32::try_from(nqpts).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateHdiv(
                 ceed.ptr,
                 topo,
@@ -276,8 +273,7 @@ impl<'a> Basis<'a> {
                 qweight.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -302,7 +298,7 @@ impl<'a> Basis<'a> {
             i32::try_from(nnodes).unwrap(),
             i32::try_from(nqpts).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateHcurl(
                 ceed.ptr,
                 topo,
@@ -315,8 +311,7 @@ impl<'a> Basis<'a> {
                 qweight.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -413,9 +408,9 @@ impl<'a> Basis<'a> {
             tmode as bind_ceed::CeedTransposeMode,
             emode as bind_ceed::CeedEvalMode,
         );
-        let ierr =
-            unsafe { bind_ceed::CeedBasisApply(self.ptr, nelem, tmode, emode, u.ptr, v.ptr) };
-        self.check_error(ierr)
+        self.check_error(unsafe {
+            bind_ceed::CeedBasisApply(self.ptr, nelem, tmode, emode, u.ptr, v.ptr)
+        })
     }
 
     /// Returns the dimension for given Basis
@@ -532,8 +527,9 @@ impl<'a> Basis<'a> {
     /// ```
     pub fn create_projection(&self, to: &Self) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedBasisCreateProjection(self.ptr, to.ptr, &mut ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe {
+            bind_ceed::CeedBasisCreateProjection(self.ptr, to.ptr, &mut ptr)
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index e2779e876c..1bbf6b0643 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -172,7 +172,7 @@ impl<'a> ElemRestriction<'a> {
             isize::try_from(lsize).unwrap(),
             mtype as bind_ceed::CeedMemType,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreate(
                 ceed.ptr,
                 nelem,
@@ -185,8 +185,7 @@ impl<'a> ElemRestriction<'a> {
                 offsets.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -220,7 +219,7 @@ impl<'a> ElemRestriction<'a> {
             isize::try_from(lsize).unwrap(),
             mtype as bind_ceed::CeedMemType,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateOriented(
                 ceed.ptr,
                 nelem,
@@ -234,8 +233,7 @@ impl<'a> ElemRestriction<'a> {
                 orients.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -262,7 +260,7 @@ impl<'a> ElemRestriction<'a> {
             isize::try_from(lsize).unwrap(),
             mtype as bind_ceed::CeedMemType,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateCurlOriented(
                 ceed.ptr,
                 nelem,
@@ -276,8 +274,7 @@ impl<'a> ElemRestriction<'a> {
                 curlorients.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -299,7 +296,7 @@ impl<'a> ElemRestriction<'a> {
             i32::try_from(ncomp).unwrap(),
             isize::try_from(lsize).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateStrided(
                 ceed.ptr,
                 nelem,
@@ -309,8 +306,7 @@ impl<'a> ElemRestriction<'a> {
                 strides.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -352,9 +348,9 @@ impl<'a> ElemRestriction<'a> {
     pub fn create_lvector<'b>(&self) -> crate::Result<Vector<'b>> {
         let mut ptr_lvector = std::ptr::null_mut();
         let null = std::ptr::null_mut() as *mut _;
-        let ierr =
-            unsafe { bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, null) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe {
+            bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, null)
+        })?;
         Vector::from_raw(ptr_lvector)
     }
 
@@ -381,9 +377,9 @@ impl<'a> ElemRestriction<'a> {
     pub fn create_evector<'b>(&self) -> crate::Result<Vector<'b>> {
         let mut ptr_evector = std::ptr::null_mut();
         let null = std::ptr::null_mut() as *mut _;
-        let ierr =
-            unsafe { bind_ceed::CeedElemRestrictionCreateVector(self.ptr, null, &mut ptr_evector) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe {
+            bind_ceed::CeedElemRestrictionCreateVector(self.ptr, null, &mut ptr_evector)
+        })?;
         Vector::from_raw(ptr_evector)
     }
 
@@ -411,10 +407,9 @@ impl<'a> ElemRestriction<'a> {
     pub fn create_vectors<'b, 'c>(&self) -> crate::Result<(Vector<'b>, Vector<'c>)> {
         let mut ptr_lvector = std::ptr::null_mut();
         let mut ptr_evector = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, &mut ptr_evector)
-        };
-        self.check_error(ierr)?;
+        })?;
         let lvector = Vector::from_raw(ptr_lvector)?;
         let evector = Vector::from_raw(ptr_evector)?;
         Ok((lvector, evector))
@@ -460,7 +455,7 @@ impl<'a> ElemRestriction<'a> {
     /// ```
     pub fn apply(&self, tmode: TransposeMode, u: &Vector, ru: &mut Vector) -> crate::Result<i32> {
         let tmode = tmode as bind_ceed::CeedTransposeMode;
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedElemRestrictionApply(
                 self.ptr,
                 tmode,
@@ -468,8 +463,7 @@ impl<'a> ElemRestriction<'a> {
                 ru.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     /// Returns the Lvector component stride
@@ -630,8 +624,9 @@ impl<'a> ElemRestriction<'a> {
     /// # }
     /// ```
     pub fn multiplicity(&self, mult: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedElemRestrictionGetMultiplicity(self.ptr, mult.ptr) };
-        self.check_error(ierr)
+        self.check_error(unsafe {
+            bind_ceed::CeedElemRestrictionGetMultiplicity(self.ptr, mult.ptr)
+        })
     }
 }
 
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index 36ad38a1c5..b248aa9fa7 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -228,8 +228,8 @@ impl Clone for Ceed {
     /// ```
     fn clone(&self) -> Self {
         let mut ptr_clone = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedReferenceCopy(self.ptr, &mut ptr_clone) };
-        self.check_error(ierr).expect("failed to clone Ceed");
+        self.check_error(unsafe { bind_ceed::CeedReferenceCopy(self.ptr, &mut ptr_clone) })
+            .expect("failed to clone Ceed");
         Self { ptr: ptr_clone }
     }
 }
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index a1a5116ec2..6f2cb11409 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -33,22 +33,21 @@ impl<'a> OperatorField<'a> {
     ) -> crate::Result<Self> {
         let vector = {
             let mut vector_ptr = std::ptr::null_mut();
-            let ierr = unsafe { bind_ceed::CeedOperatorFieldGetVector(ptr, &mut vector_ptr) };
-            ceed.check_error(ierr)?;
+            ceed.check_error(unsafe {
+                bind_ceed::CeedOperatorFieldGetVector(ptr, &mut vector_ptr)
+            })?;
             crate::Vector::from_raw(vector_ptr)?
         };
         let elem_restriction = {
             let mut elem_restriction_ptr = std::ptr::null_mut();
-            let ierr = unsafe {
+            ceed.check_error(unsafe {
                 bind_ceed::CeedOperatorFieldGetElemRestriction(ptr, &mut elem_restriction_ptr)
-            };
-            ceed.check_error(ierr)?;
+            })?;
             crate::ElemRestriction::from_raw(elem_restriction_ptr)?
         };
         let basis = {
             let mut basis_ptr = std::ptr::null_mut();
-            let ierr = unsafe { bind_ceed::CeedOperatorFieldGetBasis(ptr, &mut basis_ptr) };
-            ceed.check_error(ierr)?;
+            ceed.check_error(unsafe { bind_ceed::CeedOperatorFieldGetBasis(ptr, &mut basis_ptr) })?;
             crate::Basis::from_raw(basis_ptr)?
         };
         Ok(Self {
@@ -453,88 +452,80 @@ impl<'a> OperatorCore<'a> {
 
     // Common implementations
     pub fn check(&self) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedOperatorCheckReady(self.ptr) };
-        self.check_error(ierr)
+        self.check_error(unsafe { bind_ceed::CeedOperatorCheckReady(self.ptr) })
     }
 
     pub fn name(&self, name: &str) -> crate::Result<i32> {
         let name_c = CString::new(name).expect("CString::new failed");
-        let ierr = unsafe { bind_ceed::CeedOperatorSetName(self.ptr, name_c.as_ptr()) };
-        self.check_error(ierr)
+        self.check_error(unsafe { bind_ceed::CeedOperatorSetName(self.ptr, name_c.as_ptr()) })
     }
 
     pub fn apply(&self, input: &Vector, output: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorApply(
                 self.ptr,
                 input.ptr,
                 output.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn apply_add(&self, input: &Vector, output: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorApplyAdd(
                 self.ptr,
                 input.ptr,
                 output.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_diagonal(&self, assembled: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssembleDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_add_diagonal(&self, assembled: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssembleAddDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_point_block_diagonal(
         &self,
         assembled: &mut Vector,
     ) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssemblePointBlockDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_add_point_block_diagonal(
         &self,
         assembled: &mut Vector,
     ) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssembleAddPointBlockDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 }
 
@@ -550,7 +541,7 @@ impl<'a> Operator<'a> {
         dqfT: impl Into<QFunctionOpt<'b>>,
     ) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedOperatorCreate(
                 ceed.ptr,
                 qf.into().to_raw(),
@@ -558,8 +549,7 @@ impl<'a> Operator<'a> {
                 dqfT.into().to_raw(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             op_core: OperatorCore {
                 ptr,
@@ -814,7 +804,7 @@ impl<'a> Operator<'a> {
     ) -> crate::Result<Self> {
         let fieldname = CString::new(fieldname).expect("CString::new failed");
         let fieldname = fieldname.as_ptr() as *const i8;
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorSetField(
                 self.op_core.ptr,
                 fieldname,
@@ -822,8 +812,7 @@ impl<'a> Operator<'a> {
                 b.into().to_raw(),
                 v.into().to_raw(),
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         Ok(self)
     }
 
@@ -866,7 +855,7 @@ impl<'a> Operator<'a> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorGetFields(
                 self.op_core.ptr,
                 &mut num_inputs,
@@ -874,8 +863,7 @@ impl<'a> Operator<'a> {
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
                 std::ptr::null_mut() as *mut *mut bind_ceed::CeedOperatorField,
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let inputs_slice = unsafe {
             std::slice::from_raw_parts(
@@ -885,11 +873,10 @@ impl<'a> Operator<'a> {
         };
         // And finally build vec
         let ceed = {
+            let ceed_raw = self.op_core.ceed();
             let mut ptr = std::ptr::null_mut();
-            let mut ptr_copy = std::ptr::null_mut();
             unsafe {
-                bind_ceed::CeedOperatorGetCeed(self.op_core.ptr, &mut ptr);
-                bind_ceed::CeedReferenceCopy(ptr, &mut ptr_copy); // refcount
+                bind_ceed::CeedReferenceCopy(ceed_raw, &mut ptr); // refcount
             }
             crate::Ceed { ptr }
         };
@@ -938,7 +925,7 @@ impl<'a> Operator<'a> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorGetFields(
                 self.op_core.ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
@@ -946,8 +933,7 @@ impl<'a> Operator<'a> {
                 &mut num_outputs,
                 &mut outputs_ptr,
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let outputs_slice = unsafe {
             std::slice::from_raw_parts(
@@ -957,11 +943,10 @@ impl<'a> Operator<'a> {
         };
         // And finally build vec
         let ceed = {
+            let ceed_raw = self.op_core.ceed();
             let mut ptr = std::ptr::null_mut();
-            let mut ptr_copy = std::ptr::null_mut();
             unsafe {
-                bind_ceed::CeedOperatorGetCeed(self.op_core.ptr, &mut ptr);
-                bind_ceed::CeedReferenceCopy(ptr, &mut ptr_copy); // refcount
+                bind_ceed::CeedReferenceCopy(ceed_raw, &mut ptr); // refcount
             }
             crate::Ceed { ptr }
         };
@@ -1724,7 +1709,7 @@ impl<'a> Operator<'a> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
         let mut ptr_restrict = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorMultigridLevelCreate(
                 self.op_core.ptr,
                 p_mult_fine.ptr,
@@ -1734,8 +1719,7 @@ impl<'a> Operator<'a> {
                 &mut ptr_prolong,
                 &mut ptr_restrict,
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         let op_coarse = Operator::from_raw(ptr_coarse)?;
         let op_prolong = Operator::from_raw(ptr_prolong)?;
         let op_restrict = Operator::from_raw(ptr_restrict)?;
@@ -1914,7 +1898,7 @@ impl<'a> Operator<'a> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
         let mut ptr_restrict = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorMultigridLevelCreateTensorH1(
                 self.op_core.ptr,
                 p_mult_fine.ptr,
@@ -1925,8 +1909,7 @@ impl<'a> Operator<'a> {
                 &mut ptr_prolong,
                 &mut ptr_restrict,
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         let op_coarse = Operator::from_raw(ptr_coarse)?;
         let op_prolong = Operator::from_raw(ptr_prolong)?;
         let op_restrict = Operator::from_raw(ptr_restrict)?;
@@ -2105,7 +2088,7 @@ impl<'a> Operator<'a> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
         let mut ptr_restrict = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorMultigridLevelCreateH1(
                 self.op_core.ptr,
                 p_mult_fine.ptr,
@@ -2116,8 +2099,7 @@ impl<'a> Operator<'a> {
                 &mut ptr_prolong,
                 &mut ptr_restrict,
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         let op_coarse = Operator::from_raw(ptr_coarse)?;
         let op_prolong = Operator::from_raw(ptr_prolong)?;
         let op_restrict = Operator::from_raw(ptr_restrict)?;
@@ -2132,8 +2114,7 @@ impl<'a> CompositeOperator<'a> {
     // Constructor
     pub fn create(ceed: &crate::Ceed) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedCompositeOperatorCreate(ceed.ptr, &mut ptr) };
-        ceed.check_error(ierr)?;
+        ceed.check_error(unsafe { bind_ceed::CeedCompositeOperatorCreate(ceed.ptr, &mut ptr) })?;
         Ok(Self {
             op_core: OperatorCore {
                 ptr,
@@ -2414,9 +2395,9 @@ impl<'a> CompositeOperator<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn sub_operator(mut self, subop: &Operator) -> crate::Result<Self> {
-        let ierr =
-            unsafe { bind_ceed::CeedCompositeOperatorAddSub(self.op_core.ptr, subop.op_core.ptr) };
-        self.op_core.check_error(ierr)?;
+        self.op_core.check_error(unsafe {
+            bind_ceed::CeedCompositeOperatorAddSub(self.op_core.ptr, subop.op_core.ptr)
+        })?;
         Ok(self)
     }
 
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index 48c27e781a..f5b746eefd 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -462,17 +462,16 @@ impl<'a> QFunctionCore<'a> {
             v_c[i] = v[i].ptr;
         }
         let Q = i32::try_from(Q).unwrap();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedQFunctionApply(self.ptr, Q, u_c.as_mut_ptr(), v_c.as_mut_ptr())
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedQFunctionGetFields(
                 self.ptr,
                 &mut num_inputs,
@@ -480,8 +479,7 @@ impl<'a> QFunctionCore<'a> {
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
                 std::ptr::null_mut() as *mut *mut bind_ceed::CeedQFunctionField,
             )
-        };
-        self.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let inputs_slice = unsafe {
             std::slice::from_raw_parts(
@@ -496,7 +494,7 @@ impl<'a> QFunctionCore<'a> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedQFunctionGetFields(
                 self.ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
@@ -504,8 +502,7 @@ impl<'a> QFunctionCore<'a> {
                 &mut num_outputs,
                 &mut outputs_ptr,
             )
-        };
-        self.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let outputs_slice = unsafe {
             std::slice::from_raw_parts(
@@ -611,7 +608,7 @@ impl<'a> QFunction<'a> {
 
         // Create QFunction
         let vlength = i32::try_from(vlength).unwrap();
-        let mut ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedQFunctionCreateInterior(
                 ceed.ptr,
                 vlength,
@@ -619,14 +616,14 @@ impl<'a> QFunction<'a> {
                 source_c.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
 
         // Set closure
         let mut qf_ctx_ptr = std::ptr::null_mut();
-        ierr = unsafe { bind_ceed::CeedQFunctionContextCreate(ceed.ptr, &mut qf_ctx_ptr) };
-        ceed.check_error(ierr)?;
-        ierr = unsafe {
+        ceed.check_error(unsafe {
+            bind_ceed::CeedQFunctionContextCreate(ceed.ptr, &mut qf_ctx_ptr)
+        })?;
+        ceed.check_error(unsafe {
             bind_ceed::CeedQFunctionContextSetData(
                 qf_ctx_ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
@@ -634,20 +631,16 @@ impl<'a> QFunction<'a> {
                 std::mem::size_of::<QFunctionTrampolineData>(),
                 std::mem::transmute(trampoline_data.as_ref()),
             )
-        };
-        ceed.check_error(ierr)?;
-        ierr = unsafe {
+        })?;
+        ceed.check_error(unsafe {
             bind_ceed::CeedQFunctionContextSetDataDestroy(
                 qf_ctx_ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
                 Some(destroy_trampoline),
             )
-        };
-        ceed.check_error(ierr)?;
-        ierr = unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) };
-        ceed.check_error(ierr)?;
-        ierr = unsafe { bind_ceed::CeedQFunctionContextDestroy(&mut qf_ctx_ptr) };
-        ceed.check_error(ierr)?;
+        })?;
+        ceed.check_error(unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) })?;
+        ceed.check_error(unsafe { bind_ceed::CeedQFunctionContextDestroy(&mut qf_ctx_ptr) })?;
         Ok(Self {
             qf_core: QFunctionCore {
                 ptr,
@@ -767,10 +760,9 @@ impl<'a> QFunction<'a> {
             i32::try_from(size).unwrap(),
             emode as bind_ceed::CeedEvalMode,
         );
-        let ierr = unsafe {
+        self.qf_core.check_error(unsafe {
             bind_ceed::CeedQFunctionAddInput(self.qf_core.ptr, name_c.as_ptr(), size, emode)
-        };
-        self.qf_core.check_error(ierr)?;
+        })?;
         Ok(self)
     }
 
@@ -817,10 +809,9 @@ impl<'a> QFunction<'a> {
             i32::try_from(size).unwrap(),
             emode as bind_ceed::CeedEvalMode,
         );
-        let ierr = unsafe {
+        self.qf_core.check_error(unsafe {
             bind_ceed::CeedQFunctionAddOutput(self.qf_core.ptr, name_c.as_ptr(), size, emode)
-        };
-        self.qf_core.check_error(ierr)?;
+        })?;
         Ok(self)
     }
 
@@ -894,10 +885,9 @@ impl<'a> QFunctionByName<'a> {
     pub fn create(ceed: &crate::Ceed, name: &str) -> crate::Result<Self> {
         let name_c = CString::new(name).expect("CString::new failed");
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedQFunctionCreateInteriorByName(ceed.ptr, name_c.as_ptr(), &mut ptr)
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             qf_core: QFunctionCore {
                 ptr,
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index 0ffe84dfac..a6fd181907 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -157,16 +157,14 @@ impl<'a> VectorSliceWrapper<'a> {
             crate::MemType::Host as bind_ceed::CeedMemType,
             crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode,
         );
-        let ierr = unsafe {
+        vec.check_error(unsafe {
             bind_ceed::CeedVectorSetArray(
                 vec.ptr,
                 host,
                 copy_mode,
                 slice.as_ptr() as *mut crate::Scalar,
             )
-        };
-        vec.check_error(ierr)?;
-
+        })?;
         Ok(Self {
             vector: crate::Vector::from_raw(vec.ptr_copy_mut()?)?,
             _slice: slice,
@@ -247,8 +245,7 @@ impl<'a> Vector<'a> {
     pub fn create(ceed: &crate::Ceed, n: usize) -> crate::Result<Self> {
         let n = isize::try_from(n).unwrap();
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedVectorCreate(ceed.ptr, n, &mut ptr) };
-        ceed.check_error(ierr)?;
+        ceed.check_error(unsafe { bind_ceed::CeedVectorCreate(ceed.ptr, n, &mut ptr) })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -264,8 +261,7 @@ impl<'a> Vector<'a> {
 
     fn ptr_copy_mut(&mut self) -> crate::Result<bind_ceed::CeedVector> {
         let mut ptr_copy = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedVectorReferenceCopy(self.ptr, &mut ptr_copy) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorReferenceCopy(self.ptr, &mut ptr_copy) })?;
         Ok(ptr_copy)
     }
 
@@ -291,8 +287,7 @@ impl<'a> Vector<'a> {
     /// ```
     /// ```
     pub fn copy_from(&mut self, vec_source: &crate::Vector) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedVectorCopy(vec_source.ptr, self.ptr) };
-        self.check_error(ierr)
+        self.check_error(unsafe { bind_ceed::CeedVectorCopy(vec_source.ptr, self.ptr) })
     }
 
     /// Create a Vector from a slice
@@ -340,8 +335,7 @@ impl<'a> Vector<'a> {
             crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode,
         );
         let v = v.as_ptr() as *mut crate::Scalar;
-        let ierr = unsafe { bind_ceed::CeedVectorSetArray(x.ptr, host, user_pointer, v) };
-        ceed.check_error(ierr)?;
+        ceed.check_error(unsafe { bind_ceed::CeedVectorSetArray(x.ptr, host, user_pointer, v) })?;
         Ok(x)
     }
 
@@ -414,8 +408,7 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     pub fn set_value(&mut self, value: crate::Scalar) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedVectorSetValue(self.ptr, value) };
-        self.check_error(ierr)
+        self.check_error(unsafe { bind_ceed::CeedVectorSetValue(self.ptr, value) })
     }
 
     /// Set values from a slice of the same length
@@ -443,15 +436,14 @@ impl<'a> Vector<'a> {
             crate::MemType::Host as bind_ceed::CeedMemType,
             crate::CopyMode::CopyValues as bind_ceed::CeedCopyMode,
         );
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedVectorSetArray(
                 self.ptr,
                 host,
                 copy_mode,
                 slice.as_ptr() as *mut crate::Scalar,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     /// Wrap a mutable slice in a Vector of the same length
@@ -530,9 +522,9 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     pub fn sync(&self, mtype: crate::MemType) -> crate::Result<i32> {
-        let ierr =
-            unsafe { bind_ceed::CeedVectorSyncArray(self.ptr, mtype as bind_ceed::CeedMemType) };
-        self.check_error(ierr)
+        self.check_error(unsafe {
+            bind_ceed::CeedVectorSyncArray(self.ptr, mtype as bind_ceed::CeedMemType)
+        })
     }
 
     /// Create an immutable view
@@ -603,10 +595,9 @@ impl<'a> Vector<'a> {
     /// ```
     pub fn norm(&self, ntype: crate::NormType) -> crate::Result<crate::Scalar> {
         let mut res: crate::Scalar = 0.0;
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedVectorNorm(self.ptr, ntype as bind_ceed::CeedNormType, &mut res)
-        };
-        self.check_error(ierr)?;
+        })?;
         Ok(res)
     }
 
@@ -631,8 +622,7 @@ impl<'a> Vector<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn scale(mut self, alpha: crate::Scalar) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorScale(self.ptr, alpha) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorScale(self.ptr, alpha) })?;
         Ok(self)
     }
 
@@ -659,8 +649,7 @@ impl<'a> Vector<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn axpy(mut self, alpha: crate::Scalar, x: &crate::Vector) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorAXPY(self.ptr, alpha, x.ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorAXPY(self.ptr, alpha, x.ptr) })?;
         Ok(self)
     }
 
@@ -693,8 +682,7 @@ impl<'a> Vector<'a> {
         beta: crate::Scalar,
         x: &crate::Vector,
     ) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorAXPBY(self.ptr, alpha, beta, x.ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorAXPBY(self.ptr, alpha, beta, x.ptr) })?;
         Ok(self)
     }
 
@@ -722,8 +710,7 @@ impl<'a> Vector<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn pointwise_mult(mut self, x: &crate::Vector, y: &crate::Vector) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, x.ptr, y.ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, x.ptr, y.ptr) })?;
         Ok(self)
     }
 
@@ -749,8 +736,7 @@ impl<'a> Vector<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn pointwise_scale(mut self, x: &crate::Vector) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, x.ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, x.ptr) })?;
         Ok(self)
     }
 
@@ -771,8 +757,9 @@ impl<'a> Vector<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn pointwise_square(mut self) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, self.ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe {
+            bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, self.ptr)
+        })?;
         Ok(self)
     }
 }
@@ -793,14 +780,13 @@ impl<'a> VectorView<'a> {
     /// Construct a VectorView from a Vector reference
     fn new(vec: &'a Vector) -> crate::Result<Self> {
         let mut array = std::ptr::null();
-        let ierr = unsafe {
+        vec.check_error(unsafe {
             bind_ceed::CeedVectorGetArrayRead(
                 vec.ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
                 &mut array,
             )
-        };
-        vec.check_error(ierr)?;
+        })?;
         Ok(Self {
             vec: vec,
             array: array,
@@ -846,14 +832,13 @@ impl<'a> VectorViewMut<'a> {
     /// Construct a VectorViewMut from a Vector reference
     fn new(vec: &'a mut Vector) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        vec.check_error(unsafe {
             bind_ceed::CeedVectorGetArray(
                 vec.ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
                 &mut ptr,
             )
-        };
-        vec.check_error(ierr)?;
+        })?;
         Ok(Self {
             vec: vec,
             array: ptr,

From 243afec996543dd9d0cad1d190b7ec15127a478e Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Wed, 23 Oct 2024 15:46:34 -0600
Subject: [PATCH 225/571] fluids: Update MatCeed to be up-to-date with HONEE
 and Ratel

---
 examples/fluids/include/mat-ceed-impl.h       |   2 +-
 examples/fluids/include/mat-ceed.h            |  13 +-
 examples/fluids/src/differential_filter.c     |   2 +-
 examples/fluids/src/grid_anisotropy_tensor.c  |   2 +-
 examples/fluids/src/mat-ceed.c                | 331 +++++++++++++-----
 examples/fluids/src/setuplibceed.c            |   4 +-
 examples/fluids/src/turb_spanstats.c          |   2 +-
 .../fluids/src/velocity_gradient_projection.c |   2 +-
 8 files changed, 252 insertions(+), 106 deletions(-)

diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h
index cfc1bd61f6..d035fd230d 100644
--- a/examples/fluids/include/mat-ceed-impl.h
+++ b/examples/fluids/include/mat-ceed-impl.h
@@ -36,7 +36,7 @@ PETSC_CEED_EXTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_lo
                                                       PetscLogEvent log_event_ceed_mult_transpose, MatCeedContext *ctx);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy);
-PETSC_CEED_EXTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext *ctx);
 
 // MatCEED
 PETSC_CEED_EXTERN PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D);
diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h
index 42a192fa12..d392c6532f 100644
--- a/examples/fluids/include/mat-ceed.h
+++ b/examples/fluids/include/mat-ceed.h
@@ -14,20 +14,25 @@
 #define MATCEED "ceed"
 
 // Core functionality
-PETSC_CEED_EXTERN PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat);
+PETSC_CEED_EXTERN PetscErrorCode MatCreateCeed(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetAssemblyDataUpdateNeeded(Mat mat_ceed, PetscBool update_needed);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedCreateMatCOO(Mat mat_ceed, Mat *mat_coo);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo);
 
 PETSC_CEED_INTERN PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value);
 PETSC_CEED_INTERN PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value);
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetTime(Mat mat, PetscReal time);
+PETSC_CEED_INTERN PetscErrorCode MatCeedGetTime(Mat mat, PetscReal *time);
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetDt(Mat mat, PetscReal dt);
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetShifts(Mat mat, PetscReal shift_v, PetscReal shift_a);
 
 // Advanced functionality
-PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscCtxDestroyFn f, void *ctx);
 PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx);
-PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value);
-PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value);
 
 PETSC_CEED_EXTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void));
 PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type);
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index 12c8771ca3..e6c0db2120 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -159,7 +159,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
     PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_grid_aniso));
 
     PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_lhs, "filter width scaling", &diff_filter->filter_width_scaling_label));
-    PetscCall(MatCeedCreate(dm_filter, dm_filter, op_lhs, NULL, &mat_lhs));
+    PetscCall(MatCreateCeed(dm_filter, dm_filter, op_lhs, NULL, &mat_lhs));
 
     PetscCall(KSPCreate(PetscObjectComm((PetscObject)dm_filter), &diff_filter->ksp));
     PetscCall(KSPSetOptionsPrefix(diff_filter->ksp, "diff_filter_"));
diff --git a/examples/fluids/src/grid_anisotropy_tensor.c b/examples/fluids/src/grid_anisotropy_tensor.c
index 15692ee7d6..02f78bbb67 100644
--- a/examples/fluids/src/grid_anisotropy_tensor.c
+++ b/examples/fluids/src/grid_anisotropy_tensor.c
@@ -75,7 +75,7 @@ PetscErrorCode GridAnisotropyTensorProjectionSetupApply(Ceed ceed, User user, Ce
   {  // -- Setup KSP for L^2 projection
     Mat mat_mass;
 
-    PetscCall(MatCeedCreate(grid_aniso_proj->dm, grid_aniso_proj->dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(grid_aniso_proj->dm, grid_aniso_proj->dm, op_mass, NULL, &mat_mass));
 
     PetscCall(KSPCreate(comm, &ksp));
     PetscCall(KSPSetOptionsPrefix(ksp, "grid_anisotropy_tensor_projection_"));
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index cd164598ed..f5fb608935 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -7,7 +7,10 @@
 #include <mat-ceed.h>
 #include <petsc-ceed-utils.h>
 #include <petsc-ceed.h>
-#include <petscdmplex.h>
+#include <petscdm.h>
+#include <petscmat.h>
+#include <stdbool.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -163,79 +166,76 @@ static PetscErrorCode MatCeedAssembleInnerBlockDiagonalMat(Mat mat_ceed, PetscBo
 }
 
 /**
-  @brief Get `MATCEED` diagonal block for Jacobi.
+  @brief Get `MATCEED` variable block diagonal for Jacobi.
 
   Collective across MPI processes.
 
   @param[in]   mat_ceed   `MATCEED` to invert
-  @param[out]  mat_block  The diagonal block matrix
+  @param[out]  mat_vblock  The variable diagonal block matrix
 
   @return An error code: 0 - success, otherwise - failure
 **/
-static PetscErrorCode MatGetDiagonalBlock_Ceed(Mat mat_ceed, Mat *mat_block) {
-  Mat            mat_inner = NULL;
+static PetscErrorCode MatGetVariableBlockDiagonal_Ceed(Mat mat_ceed, Mat *mat_vblock) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
   // Assemble inner mat if needed
-  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, &mat_inner));
-
-  // Get block diagonal
-  PetscCall(MatGetDiagonalBlock(mat_inner, mat_block));
+  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_vpbd_valid, mat_vblock));
+  PetscCall(PetscObjectReference((PetscObject)*mat_vblock));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
 /**
-  @brief Invert `MATCEED` diagonal block for Jacobi.
+  @brief Get `MATCEED` block diagonal for Jacobi.
 
   Collective across MPI processes.
 
-  @param[in]   mat_ceed  `MATCEED` to invert
-  @param[out]  values    The block inverses in column major order
+  @param[in]   mat_ceed   `MATCEED` to invert
+  @param[out]  mat_block  The variable diagonal block matrix
 
   @return An error code: 0 - success, otherwise - failure
 **/
-static PetscErrorCode MatInvertBlockDiagonal_Ceed(Mat mat_ceed, const PetscScalar **values) {
-  Mat            mat_inner = NULL;
+static PetscErrorCode MatGetBlockDiagonal_Ceed(Mat mat_ceed, Mat *mat_block) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
   // Assemble inner mat if needed
-  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, &mat_inner));
-
-  // Invert PB diagonal
-  PetscCall(MatInvertBlockDiagonal(mat_inner, values));
+  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, mat_block));
+  PetscCall(PetscObjectReference((PetscObject)*mat_block));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
 /**
-  @brief Invert `MATCEED` variable diagonal block for Jacobi.
+  @brief Get on-process diagonal block of `MATCEED`
+
+  This is a block per-process of the diagonal of the global matrix.
+  This is NOT the diagonal blocks associated with the block size of the matrix (i.e. `MatSetBlockSize()` has no effect on this function).
 
   Collective across MPI processes.
 
-  @param[in]   mat_ceed     `MATCEED` to invert
-  @param[in]   num_blocks   The number of blocks on the process
-  @param[in]   block_sizes  The size of each block on the process
-  @param[out]  values       The block inverses in column major order
+  @param[in]   mat_ceed   `MATCEED` to invert
+  @param[out]  mat_block  The diagonal block matrix
 
   @return An error code: 0 - success, otherwise - failure
 **/
-static PetscErrorCode MatInvertVariableBlockDiagonal_Ceed(Mat mat_ceed, PetscInt num_blocks, const PetscInt *block_sizes, PetscScalar *values) {
-  Mat            mat_inner = NULL;
+static PetscErrorCode MatGetDiagonalBlock_Ceed(Mat mat_ceed, Mat *mat_block) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
-  // Assemble inner mat if needed
-  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_vpbd_valid, &mat_inner));
+  // Check if COO pattern set
+  if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_full_internal));
+
+  // Assemble mat_assembled_full_internal
+  PetscCall(MatCeedAssembleCOO(mat_ceed, ctx->mat_assembled_full_internal));
 
-  // Invert PB diagonal
-  PetscCall(MatInvertVariableBlockDiagonal(mat_inner, num_blocks, block_sizes, values));
+  // Get diagonal block
+  PetscCall(MatGetDiagonalBlock(ctx->mat_assembled_full_internal, mat_block));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -252,7 +252,7 @@ static PetscErrorCode MatInvertVariableBlockDiagonal_Ceed(Mat mat_ceed, PetscInt
 static PetscErrorCode MatView_Ceed(Mat mat_ceed, PetscViewer viewer) {
   PetscBool         is_ascii;
   PetscViewerFormat format;
-  PetscMPIInt       size;
+  PetscMPIInt       size, rank;
   MatCeedContext    ctx;
 
   PetscFunctionBeginUser;
@@ -264,18 +264,35 @@ static PetscErrorCode MatView_Ceed(Mat mat_ceed, PetscViewer viewer) {
   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat_ceed), &size));
   if (size == 1 && format == PETSC_VIEWER_LOAD_BALANCE) PetscFunctionReturn(PETSC_SUCCESS);
 
+  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat_ceed), &rank));
+  if (rank != 0) PetscFunctionReturn(PETSC_SUCCESS);
+
   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &is_ascii));
   {
-    FILE *file;
-
-    PetscCall(PetscViewerASCIIPrintf(viewer, "MatCEED:\n  Default COO MatType:%s\n", ctx->coo_mat_type));
+    PetscBool is_detailed     = format == PETSC_VIEWER_ASCII_INFO_DETAIL;
+    char      rank_string[16] = {'\0'};
+    FILE     *file;
+
+    PetscCall(PetscViewerASCIIPrintf(viewer, "MatCEED:\n"));
+    PetscCall(PetscViewerASCIIPushTab(viewer));  // MatCEED
+    PetscCall(PetscViewerASCIIPrintf(viewer, "Default COO MatType: %s\n", ctx->coo_mat_type));
+    PetscCall(PetscSNPrintf(rank_string, 16, "on Rank %d", rank));
+    PetscCall(PetscViewerASCIIPrintf(viewer, "CeedOperator Apply %s:\n", is_detailed ? rank_string : "Summary"));
+    PetscCall(PetscViewerASCIIPrintf(viewer, "libCEED PB Diagonal Assembly: %s\n", ctx->is_ceed_pbd_valid ? "True" : "False"));
+    PetscCall(PetscViewerASCIIPrintf(viewer, "libCEED VPB Diagonal Assembly: %s\n", ctx->is_ceed_vpbd_valid ? "True" : "False"));
     PetscCall(PetscViewerASCIIGetPointer(viewer, &file));
-    PetscCall(PetscViewerASCIIPrintf(viewer, " libCEED Operator:\n"));
-    PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult, file));
+    PetscCall(PetscViewerASCIIPushTab(viewer));  // CeedOperator
+    if (is_detailed) PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult, file));
+    else PetscCallCeed(ctx->ceed, CeedOperatorViewTerse(ctx->op_mult, file));
+    PetscCall(PetscViewerASCIIPopTab(viewer));  // CeedOperator
     if (ctx->op_mult_transpose) {
-      PetscCall(PetscViewerASCIIPrintf(viewer, "  libCEED Transpose Operator:\n"));
-      PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult_transpose, file));
+      PetscCall(PetscViewerASCIIPrintf(viewer, "CeedOperator ApplyTranspose %s:\n", is_detailed ? rank_string : "Summary"));
+      PetscCall(PetscViewerASCIIPushTab(viewer));  // CeedOperator
+      if (is_detailed) PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult_transpose, file));
+      else PetscCallCeed(ctx->ceed, CeedOperatorViewTerse(ctx->op_mult_transpose, file));
+      PetscCall(PetscViewerASCIIPopTab(viewer));  // CeedOperator
     }
+    PetscCall(PetscViewerASCIIPopTab(viewer));  // MatCEED
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -297,7 +314,7 @@ static PetscErrorCode MatView_Ceed(Mat mat_ceed, PetscViewer viewer) {
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat) {
+PetscErrorCode MatCreateCeed(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat) {
   PetscInt       X_l_size, X_g_size, Y_l_size, Y_g_size;
   VecType        vec_type;
   MatCeedContext ctx;
@@ -449,14 +466,14 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
     PetscCall(PetscStrallocpy(coo_mat_type, &ctx->coo_mat_type));
   }
   // -- Set mat operations
-  PetscCall(MatShellSetContextDestroy(*mat, (PetscErrorCode(*)(void *))MatCeedContextDestroy));
+  PetscCall(MatShellSetContextDestroy(*mat, (PetscCtxDestroyFn *)MatCeedContextDestroy));
   PetscCall(MatShellSetOperation(*mat, MATOP_VIEW, (void (*)(void))MatView_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_MULT, (void (*)(void))MatMult_Ceed));
   if (op_mult_transpose) PetscCall(MatShellSetOperation(*mat, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_GET_DIAGONAL_BLOCK, (void (*)(void))MatGetDiagonalBlock_Ceed));
-  PetscCall(MatShellSetOperation(*mat, MATOP_INVERT_BLOCK_DIAGONAL, (void (*)(void))MatInvertBlockDiagonal_Ceed));
-  PetscCall(MatShellSetOperation(*mat, MATOP_INVERT_VBLOCK_DIAGONAL, (void (*)(void))MatInvertVariableBlockDiagonal_Ceed));
+  PetscCall(MatShellSetOperation(*mat, MATOP_GET_BLOCK_DIAGONAL, (void (*)(void))MatGetBlockDiagonal_Ceed));
+  PetscCall(MatShellSetOperation(*mat, MATOP_GET_VBLOCK_DIAGONAL, (void (*)(void))MatGetVariableBlockDiagonal_Ceed));
   PetscCall(MatShellSetVecType(*mat, vec_type));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -515,14 +532,14 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
     PetscCall(MatShellGetContext(mat_ceed, &ctx));
     PetscCall(MatCeedContextReference(ctx));
     PetscCall(MatShellSetContext(mat_other, ctx));
-    PetscCall(MatShellSetContextDestroy(mat_other, (PetscErrorCode(*)(void *))MatCeedContextDestroy));
+    PetscCall(MatShellSetContextDestroy(mat_other, (PetscCtxDestroyFn *)MatCeedContextDestroy));
     PetscCall(MatShellSetOperation(mat_other, MATOP_VIEW, (void (*)(void))MatView_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_MULT, (void (*)(void))MatMult_Ceed));
     if (ctx->op_mult_transpose) PetscCall(MatShellSetOperation(mat_other, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_GET_DIAGONAL_BLOCK, (void (*)(void))MatGetDiagonalBlock_Ceed));
-    PetscCall(MatShellSetOperation(mat_other, MATOP_INVERT_BLOCK_DIAGONAL, (void (*)(void))MatInvertBlockDiagonal_Ceed));
-    PetscCall(MatShellSetOperation(mat_other, MATOP_INVERT_VBLOCK_DIAGONAL, (void (*)(void))MatInvertVariableBlockDiagonal_Ceed));
+    PetscCall(MatShellSetOperation(mat_other, MATOP_GET_BLOCK_DIAGONAL, (void (*)(void))MatGetBlockDiagonal_Ceed));
+    PetscCall(MatShellSetOperation(mat_other, MATOP_GET_VBLOCK_DIAGONAL, (void (*)(void))MatGetVariableBlockDiagonal_Ceed));
     {
       PetscInt block_size;
 
@@ -542,6 +559,32 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Mark `CeedQFunction` data as updated and the `CeedQFunction` as requiring re-assembly for a `MatCEED`.
+
+  Collective across MPI processes.
+
+  @param[in]   mat_ceed       `MATCEED`
+  @param[out]  update_needed  Boolean flag indicating `CeedQFunction` update needed
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetAssemblyDataUpdateNeeded(Mat mat_ceed, PetscBool update_needed) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+  PetscCallCeed(ctx->ceed, CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(ctx->op_mult, update_needed));
+  if (ctx->op_mult_transpose) {
+    PetscCallCeed(ctx->ceed, CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(ctx->op_mult_transpose, update_needed));
+  }
+  if (update_needed) {
+    PetscCall(MatAssemblyBegin(mat_ceed, MAT_FINAL_ASSEMBLY));
+    PetscCall(MatAssemblyEnd(mat_ceed, MAT_FINAL_ASSEMBLY));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Setup a `Mat` with the same COO pattern as a `MatCEED`.
 
@@ -720,15 +763,25 @@ PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value)
 
     PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult, name, &label));
     if (label) {
-      PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult, label, &value));
-      was_updated = PETSC_TRUE;
+      double set_value = 2 * value + 1.0;
+
+      PetscCall(MatCeedGetContextDouble(mat, name, &set_value));
+      if (set_value != value) {
+        PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult, label, &value));
+        was_updated = PETSC_TRUE;
+      }
     }
     if (ctx->op_mult_transpose) {
       label = NULL;
       PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult_transpose, name, &label));
       if (label) {
-        PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult_transpose, label, &value));
-        was_updated = PETSC_TRUE;
+        double set_value = 2 * value + 1.0;
+
+        PetscCall(MatCeedGetContextDouble(mat, name, &set_value));
+        if (set_value != value) {
+          PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult_transpose, label, &value));
+          was_updated = PETSC_TRUE;
+        }
       }
     }
   }
@@ -739,25 +792,6 @@ PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value)
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-/**
-  @brief Set the current `PetscReal` value of a context field for a `MatCEED`.
-
-  Not collective across MPI processes.
-
-  @param[in,out]  mat    `MatCEED`
-  @param[in]      name   Name of the context field
-  @param[in]      value  New context field value
-
-  @return An error code: 0 - success, otherwise - failure
-**/
-PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value) {
-  double value_double = value;
-
-  PetscFunctionBeginUser;
-  PetscCall(MatCeedSetContextDouble(mat, name, value_double));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
 /**
   @brief Get the current value of a context field for a `MatCEED`.
 
@@ -795,6 +829,25 @@ PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value)
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Set the current `PetscReal` value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat    `MatCEED`
+  @param[in]      name   Name of the context field
+  @param[in]      value  New context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value) {
+  double value_double = value;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatCeedSetContextDouble(mat, name, value_double));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Get the current `PetscReal` value of a context field for a `MatCEED`.
 
@@ -807,7 +860,7 @@ PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value)
   @return An error code: 0 - success, otherwise - failure
 **/
 PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value) {
-  double value_double;
+  double value_double = 0.0;
 
   PetscFunctionBeginUser;
   PetscCall(MatCeedGetContextDouble(mat, name, &value_double));
@@ -815,6 +868,94 @@ PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Set the current time for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat   `MatCEED`
+  @param[in]      time  Current time
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetTime(Mat mat, PetscReal time) {
+  PetscFunctionBeginUser;
+  {
+    double time_ceed = time;
+
+    PetscCall(MatCeedSetContextDouble(mat, "time", time_ceed));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Get the current time for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in]   mat   `MatCEED`
+  @param[out]  time  Current time, or -1.0 if the boundary evaluator has no time field
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetTime(Mat mat, PetscReal *time) {
+  PetscFunctionBeginUser;
+  *time = -1.0;
+  {
+    double time_ceed = -1.0;
+
+    PetscCall(MatCeedGetContextDouble(mat, "time", &time_ceed));
+    *time = time_ceed;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Set the current time step for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat  `MatCEED`
+  @param[in]      dt   Current time step
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetDt(Mat mat, PetscReal dt) {
+  PetscFunctionBeginUser;
+  {
+    double dt_ceed = dt;
+
+    PetscCall(MatCeedSetContextDouble(mat, "dt", dt_ceed));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Set the Jacobian shifts for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat      `MatCEED`
+  @param[in]      shift_v  Velocity shift
+  @param[in]      shift_a  Acceleration shift
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetShifts(Mat mat, PetscReal shift_v, PetscReal shift_a) {
+  PetscFunctionBeginUser;
+  {
+    double shift_v_ceed = shift_v;
+
+    PetscCall(MatCeedSetContextDouble(mat, "shift v", shift_v_ceed));
+  }
+  if (shift_a) {
+    double shift_a_ceed = shift_a;
+
+    PetscCall(MatCeedSetContextDouble(mat, "shift a", shift_a_ceed));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Set user context for a `MATCEED`.
 
@@ -826,14 +967,14 @@ PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx) {
+PetscErrorCode MatCeedSetContext(Mat mat, PetscCtxDestroyFn f, void *ctx) {
   PetscContainer user_ctx = NULL;
 
   PetscFunctionBeginUser;
   if (ctx) {
     PetscCall(PetscContainerCreate(PetscObjectComm((PetscObject)mat), &user_ctx));
     PetscCall(PetscContainerSetPointer(user_ctx, ctx));
-    PetscCall(PetscContainerSetUserDestroy(user_ctx, f));
+    PetscCall(PetscContainerSetCtxDestroy(user_ctx, f));
   }
   PetscCall(PetscObjectCompose((PetscObject)mat, "MatCeed user context", (PetscObject)user_ctx));
   PetscCall(PetscContainerDestroy(&user_ctx));
@@ -1316,7 +1457,7 @@ PetscErrorCode MatCeedContextReference(MatCeedContext ctx) {
 PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy) {
   PetscFunctionBeginUser;
   PetscCall(MatCeedContextReference(ctx));
-  PetscCall(MatCeedContextDestroy(*ctx_copy));
+  PetscCall(MatCeedContextDestroy(ctx_copy));
   *ctx_copy = ctx;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -1330,33 +1471,33 @@ PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *c
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx) {
+PetscErrorCode MatCeedContextDestroy(MatCeedContext *ctx) {
   PetscFunctionBeginUser;
-  if (!ctx || --ctx->ref_count > 0) PetscFunctionReturn(PETSC_SUCCESS);
+  if (!ctx || --(*ctx)->ref_count > 0) PetscFunctionReturn(PETSC_SUCCESS);
 
   // PETSc objects
-  PetscCall(DMDestroy(&ctx->dm_x));
-  PetscCall(DMDestroy(&ctx->dm_y));
-  PetscCall(VecDestroy(&ctx->X_loc));
-  PetscCall(VecDestroy(&ctx->Y_loc_transpose));
-  PetscCall(MatDestroy(&ctx->mat_assembled_full_internal));
-  PetscCall(MatDestroy(&ctx->mat_assembled_pbd_internal));
-  PetscCall(PetscFree(ctx->coo_mat_type));
-  PetscCall(PetscFree(ctx->mats_assembled_full));
-  PetscCall(PetscFree(ctx->mats_assembled_pbd));
+  PetscCall(DMDestroy(&(*ctx)->dm_x));
+  PetscCall(DMDestroy(&(*ctx)->dm_y));
+  PetscCall(VecDestroy(&(*ctx)->X_loc));
+  PetscCall(VecDestroy(&(*ctx)->Y_loc_transpose));
+  PetscCall(MatDestroy(&(*ctx)->mat_assembled_full_internal));
+  PetscCall(MatDestroy(&(*ctx)->mat_assembled_pbd_internal));
+  PetscCall(PetscFree((*ctx)->coo_mat_type));
+  PetscCall(PetscFree((*ctx)->mats_assembled_full));
+  PetscCall(PetscFree((*ctx)->mats_assembled_pbd));
 
   // libCEED objects
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->x_loc));
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->y_loc));
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->coo_values_full));
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->coo_values_pbd));
-  PetscCallCeed(ctx->ceed, CeedOperatorDestroy(&ctx->op_mult));
-  PetscCallCeed(ctx->ceed, CeedOperatorDestroy(&ctx->op_mult_transpose));
-  PetscCheck(CeedDestroy(&ctx->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "destroying libCEED context object failed");
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->x_loc));
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->y_loc));
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->coo_values_full));
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->coo_values_pbd));
+  PetscCallCeed((*ctx)->ceed, CeedOperatorDestroy(&(*ctx)->op_mult));
+  PetscCallCeed((*ctx)->ceed, CeedOperatorDestroy(&(*ctx)->op_mult_transpose));
+  PetscCheck(CeedDestroy(&(*ctx)->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "destroying libCEED context object failed");
 
   // Deallocate
-  ctx->is_destroyed = PETSC_TRUE;  // Flag as destroyed in case someone has stale ref
-  PetscCall(PetscFree(ctx));
+  (*ctx)->is_destroyed = PETSC_TRUE;  // Flag as destroyed in case someone has stale ref
+  PetscCall(PetscFree(*ctx));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -1434,11 +1575,11 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) {
     PetscCall(VecPetscToCeed(Y_loc, &y_mem_type, ctx->y_loc));
 
     // Apply libCEED operator
-    PetscCall(PetscLogGpuTimeBegin());
     PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult, A, X, Y, NULL));
+    PetscCall(PetscLogGpuTimeBegin());
     PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult, ctx->x_loc, ctx->y_loc, CEED_REQUEST_IMMEDIATE));
-    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult, A, X, Y, NULL));
     PetscCall(PetscLogGpuTimeEnd());
+    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult, A, X, Y, NULL));
 
     // Restore PETSc vectors
     PetscCall(VecReadCeedToPetsc(ctx->x_loc, x_mem_type, X_loc));
@@ -1495,11 +1636,11 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) {
     PetscCall(VecPetscToCeed(X_loc, &x_mem_type, ctx->x_loc));
 
     // Apply libCEED operator
-    PetscCall(PetscLogGpuTimeBegin());
     PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL));
+    PetscCall(PetscLogGpuTimeBegin());
     PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult_transpose, ctx->y_loc, ctx->x_loc, CEED_REQUEST_IMMEDIATE));
-    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL));
     PetscCall(PetscLogGpuTimeEnd());
+    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL));
 
     // Restore PETSc vectors
     PetscCall(VecReadCeedToPetsc(ctx->y_loc, y_mem_type, Y_loc));
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 1754d5e521..9ce48ae762 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -70,7 +70,7 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) {
 
     PetscCall(DMCreateLocalVector(dm, &Zeros_loc));
     PetscCall(VecZeroEntries(Zeros_loc));
-    PetscCall(MatCeedCreate(dm, dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(dm, dm, op_mass, NULL, &mat_mass));
     PetscCall(MatCeedSetLocalVectors(mat_mass, Zeros_loc, NULL));
 
     PetscCall(KSPCreate(comm, &user->mass_ksp));
@@ -469,7 +469,7 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(user->op_ifunction, "timestep size", &user->phys->timestep_size_label));
 
     if (op_ijacobian) {
-      PetscCall(MatCeedCreate(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian));
+      PetscCall(MatCreateCeed(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian));
       PetscCall(MatCeedSetLocalVectors(user->mat_ijacobian, user->Q_dot_loc, NULL));
       PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian));
     }
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 08f9ef36b9..6c27978fde 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -326,7 +326,7 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
     Mat mat_mass;
     KSP ksp;
 
-    PetscCall(MatCeedCreate(user->spanstats.dm, user->spanstats.dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(user->spanstats.dm, user->spanstats.dm, op_mass, NULL, &mat_mass));
 
     PetscCall(KSPCreate(PetscObjectComm((PetscObject)user->spanstats.dm), &ksp));
     PetscCall(KSPSetOptionsPrefix(ksp, "turbulence_spanstats_"));
diff --git a/examples/fluids/src/velocity_gradient_projection.c b/examples/fluids/src/velocity_gradient_projection.c
index 931b69f57d..0ee457139a 100644
--- a/examples/fluids/src/velocity_gradient_projection.c
+++ b/examples/fluids/src/velocity_gradient_projection.c
@@ -107,7 +107,7 @@ PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ce
     Mat      mat_mass;
     MPI_Comm comm = PetscObjectComm((PetscObject)grad_velo_proj->dm);
 
-    PetscCall(MatCeedCreate(grad_velo_proj->dm, grad_velo_proj->dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(grad_velo_proj->dm, grad_velo_proj->dm, op_mass, NULL, &mat_mass));
 
     PetscCall(KSPCreate(comm, &grad_velo_proj->ksp));
     PetscCall(KSPSetOptionsPrefix(grad_velo_proj->ksp, "velocity_gradient_projection_"));

From 2b671a0a8bf5062b196159b6c524acbfbb639e7b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 24 Oct 2024 16:46:23 -0600
Subject: [PATCH 226/571] rust - from_raw is unsafe

---
 rust/libceed/src/basis.rs            |  2 +-
 rust/libceed/src/elem_restriction.rs | 10 +++----
 rust/libceed/src/operator.rs         | 39 ++++++++++++++--------------
 rust/libceed/src/vector.rs           |  4 +--
 4 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index a3188f7bfb..2e719097e7 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -172,7 +172,7 @@ impl<'a> Basis<'a> {
         })
     }
 
-    pub(crate) fn from_raw(ptr: bind_ceed::CeedBasis) -> crate::Result<Self> {
+    pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedBasis) -> crate::Result<Self> {
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index 1bbf6b0643..081e08c61b 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -192,7 +192,7 @@ impl<'a> ElemRestriction<'a> {
         })
     }
 
-    pub(crate) fn from_raw(ptr: bind_ceed::CeedElemRestriction) -> crate::Result<Self> {
+    pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedElemRestriction) -> crate::Result<Self> {
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -351,7 +351,7 @@ impl<'a> ElemRestriction<'a> {
         self.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, null)
         })?;
-        Vector::from_raw(ptr_lvector)
+        unsafe { Vector::from_raw(ptr_lvector) }
     }
 
     /// Create an Evector for an ElemRestriction
@@ -380,7 +380,7 @@ impl<'a> ElemRestriction<'a> {
         self.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateVector(self.ptr, null, &mut ptr_evector)
         })?;
-        Vector::from_raw(ptr_evector)
+        unsafe { Vector::from_raw(ptr_evector) }
     }
 
     /// Create Vectors for an ElemRestriction
@@ -410,8 +410,8 @@ impl<'a> ElemRestriction<'a> {
         self.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, &mut ptr_evector)
         })?;
-        let lvector = Vector::from_raw(ptr_lvector)?;
-        let evector = Vector::from_raw(ptr_evector)?;
+        let lvector = unsafe { Vector::from_raw(ptr_lvector)? };
+        let evector = unsafe { Vector::from_raw(ptr_evector)? };
         Ok((lvector, evector))
     }
 
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 6f2cb11409..828ac5b998 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -27,27 +27,26 @@ pub struct OperatorField<'a> {
 // Implementations
 // -----------------------------------------------------------------------------
 impl<'a> OperatorField<'a> {
-    pub(crate) fn from_raw(
+    pub(crate) unsafe fn from_raw(
         ptr: bind_ceed::CeedOperatorField,
         ceed: crate::Ceed,
     ) -> crate::Result<Self> {
         let vector = {
             let mut vector_ptr = std::ptr::null_mut();
-            ceed.check_error(unsafe {
-                bind_ceed::CeedOperatorFieldGetVector(ptr, &mut vector_ptr)
-            })?;
+            ceed.check_error(bind_ceed::CeedOperatorFieldGetVector(ptr, &mut vector_ptr))?;
             crate::Vector::from_raw(vector_ptr)?
         };
         let elem_restriction = {
             let mut elem_restriction_ptr = std::ptr::null_mut();
-            ceed.check_error(unsafe {
-                bind_ceed::CeedOperatorFieldGetElemRestriction(ptr, &mut elem_restriction_ptr)
-            })?;
+            ceed.check_error(bind_ceed::CeedOperatorFieldGetElemRestriction(
+                ptr,
+                &mut elem_restriction_ptr,
+            ))?;
             crate::ElemRestriction::from_raw(elem_restriction_ptr)?
         };
         let basis = {
             let mut basis_ptr = std::ptr::null_mut();
-            ceed.check_error(unsafe { bind_ceed::CeedOperatorFieldGetBasis(ptr, &mut basis_ptr) })?;
+            ceed.check_error(bind_ceed::CeedOperatorFieldGetBasis(ptr, &mut basis_ptr))?;
             crate::Basis::from_raw(basis_ptr)?
         };
         Ok(Self {
@@ -558,7 +557,7 @@ impl<'a> Operator<'a> {
         })
     }
 
-    fn from_raw(ptr: bind_ceed::CeedOperator) -> crate::Result<Self> {
+    unsafe fn from_raw(ptr: bind_ceed::CeedOperator) -> crate::Result<Self> {
         Ok(Self {
             op_core: OperatorCore {
                 ptr,
@@ -881,7 +880,7 @@ impl<'a> Operator<'a> {
             crate::Ceed { ptr }
         };
         let inputs = (0..num_inputs as usize)
-            .map(|i| crate::OperatorField::from_raw(inputs_slice[i], ceed.clone()))
+            .map(|i| unsafe { crate::OperatorField::from_raw(inputs_slice[i], ceed.clone()) })
             .collect::<crate::Result<Vec<_>>>()?;
         Ok(inputs)
     }
@@ -951,7 +950,7 @@ impl<'a> Operator<'a> {
             crate::Ceed { ptr }
         };
         let outputs = (0..num_outputs as usize)
-            .map(|i| crate::OperatorField::from_raw(outputs_slice[i], ceed.clone()))
+            .map(|i| unsafe { crate::OperatorField::from_raw(outputs_slice[i], ceed.clone()) })
             .collect::<crate::Result<Vec<_>>>()?;
         Ok(outputs)
     }
@@ -1720,9 +1719,9 @@ impl<'a> Operator<'a> {
                 &mut ptr_restrict,
             )
         })?;
-        let op_coarse = Operator::from_raw(ptr_coarse)?;
-        let op_prolong = Operator::from_raw(ptr_prolong)?;
-        let op_restrict = Operator::from_raw(ptr_restrict)?;
+        let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? };
+        let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? };
+        let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? };
         Ok((op_coarse, op_prolong, op_restrict))
     }
 
@@ -1910,9 +1909,9 @@ impl<'a> Operator<'a> {
                 &mut ptr_restrict,
             )
         })?;
-        let op_coarse = Operator::from_raw(ptr_coarse)?;
-        let op_prolong = Operator::from_raw(ptr_prolong)?;
-        let op_restrict = Operator::from_raw(ptr_restrict)?;
+        let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? };
+        let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? };
+        let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? };
         Ok((op_coarse, op_prolong, op_restrict))
     }
 
@@ -2100,9 +2099,9 @@ impl<'a> Operator<'a> {
                 &mut ptr_restrict,
             )
         })?;
-        let op_coarse = Operator::from_raw(ptr_coarse)?;
-        let op_prolong = Operator::from_raw(ptr_prolong)?;
-        let op_restrict = Operator::from_raw(ptr_restrict)?;
+        let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? };
+        let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? };
+        let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? };
         Ok((op_coarse, op_prolong, op_restrict))
     }
 }
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index a6fd181907..3394ecd870 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -166,7 +166,7 @@ impl<'a> VectorSliceWrapper<'a> {
             )
         })?;
         Ok(Self {
-            vector: crate::Vector::from_raw(vec.ptr_copy_mut()?)?,
+            vector: unsafe { crate::Vector::from_raw(vec.ptr_copy_mut()?)? },
             _slice: slice,
         })
     }
@@ -252,7 +252,7 @@ impl<'a> Vector<'a> {
         })
     }
 
-    pub(crate) fn from_raw(ptr: bind_ceed::CeedVector) -> crate::Result<Self> {
+    pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedVector) -> crate::Result<Self> {
         Ok(Self {
             ptr,
             _lifeline: PhantomData,

From 391f7d98b23119a3db61bce3e938b5dca7339952 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sat, 26 Oct 2024 21:44:43 -0600
Subject: [PATCH 227/571] swarm: Implement RestorePointsPerCell

---
 examples/petsc/dmswarm.c        | 4 ++--
 examples/petsc/src/swarmutils.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/petsc/dmswarm.c b/examples/petsc/dmswarm.c
index 1dc3618fd8..113730e15a 100644
--- a/examples/petsc/dmswarm.c
+++ b/examples/petsc/dmswarm.c
@@ -398,7 +398,7 @@ PetscErrorCode DMSwarmInterpolateFromCellToSwarm_Petsc(DM dm_swarm, const char *
     PetscCall(DMRestoreWorkArray(dm_mesh, num_points_in_cell * dim, MPIU_REAL, &coords_points_cell_true));
     PetscCall(DMRestoreWorkArray(dm_mesh, num_points_in_cell * dim, MPIU_REAL, &coords_points_cell_ref));
     PetscCall(PetscTabulationDestroy(&tabulation));
-    PetscCall(PetscFree(points_cell));
+    PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points_cell));
   }
 
   // Cleanup
@@ -486,7 +486,7 @@ PetscErrorCode DMSwarmCheckSwarmValues(DM dm_swarm, const char *field, PetscScal
     }
 
     // -- Cleanup
-    PetscCall(PetscFree(points));
+    PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points));
   }
 
   // Cleanup
diff --git a/examples/petsc/src/swarmutils.c b/examples/petsc/src/swarmutils.c
index f736581ee5..ee047b724f 100644
--- a/examples/petsc/src/swarmutils.c
+++ b/examples/petsc/src/swarmutils.c
@@ -391,7 +391,7 @@ PetscErrorCode DMSwarmCreateReferenceCoordinates(DM dm_swarm, IS *is_points, Vec
     }
 
     // -- Cleanup
-    PetscCall(PetscFree(points_in_cell));
+    PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points_in_cell));
   }
   cell_points[points_offset - 1] = num_points_local + points_offset;
 

From 7f836c3130692982ef72689a0598b6a4d81ae7d4 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sun, 27 Oct 2024 00:20:11 +0000
Subject: [PATCH 228/571] fix(sycl): Replaces pragma once with include guards

OpenCL doesn't really like pragma once evidently. I think we've been
'getting away' with it previously as the JIT processing automatically
doesn't allow for nested includes, but the same is not done for
<ceed/types.h>?
---
 include/ceed/jit-source/sycl/sycl-types.h | 5 ++++-
 include/ceed/types.h                      | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h
index b42ad10385..58938a4b2a 100644
--- a/include/ceed/jit-source/sycl/sycl-types.h
+++ b/include/ceed/jit-source/sycl/sycl-types.h
@@ -7,7 +7,8 @@
 
 /// @file
 /// Internal header for SYCL type definitions
-#pragma once
+#ifndef CEED_SYCL_TYPES_H
+#define CEED_SYCL_TYPES_H
 
 #include <ceed/types.h>
 
@@ -34,3 +35,5 @@ typedef struct {
   CeedInt       *outputs[CEED_SYCL_NUMBER_FIELDS];
 } FieldsInt_Sycl;
 #endif
+
+#endif  // CEED_SYCL_TYPES_H
diff --git a/include/ceed/types.h b/include/ceed/types.h
index 3f858a7ca4..6c6d126548 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -7,7 +7,8 @@
 
 /// @file
 /// Public header for types and macros used in user QFunction source code
-#pragma once
+#ifndef CEED_QFUNCTION_DEFS_H
+#define CEED_QFUNCTION_DEFS_H
 
 #ifndef CEED_RUNNING_JIT_PASS
 #include <stddef.h>
@@ -252,3 +253,5 @@ typedef enum {
   /// Boolean value
   CEED_CONTEXT_FIELD_BOOL = 3,
 } CeedContextFieldType;
+
+#endif  // CEED_QFUNCTION_DEFS_H

From 81ae61599cc0e14ccce523d66e38bf75c6f7903c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 11 Nov 2024 13:15:29 -0700
Subject: [PATCH 229/571] gpu - split AtPoints basis between Transpose/no

---
 backends/cuda-ref/ceed-cuda-ref-basis.c       |  15 +-
 backends/cuda-ref/ceed-cuda-ref.h             |   2 +
 backends/cuda-shared/ceed-cuda-shared-basis.c |  15 +-
 backends/cuda-shared/ceed-cuda-shared.h       |   2 +
 backends/hip-ref/ceed-hip-ref-basis.c         |  15 +-
 backends/hip-ref/ceed-hip-ref.h               |   2 +
 backends/hip-shared/ceed-hip-shared-basis.c   |  15 +-
 backends/hip-shared/ceed-hip-shared.h         |   2 +
 .../cuda/cuda-ref-basis-tensor-at-points.h    | 494 ++++++++++--------
 .../hip/hip-ref-basis-tensor-at-points.h      | 494 ++++++++++--------
 10 files changed, 586 insertions(+), 470 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index d7ab9a4aae..544a5cb188 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -194,7 +194,9 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                      max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -220,16 +222,17 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 0f6ca9d1cb..582b43a975 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -70,7 +70,9 @@ typedef struct {
   CUmodule    moduleAtPoints;
   CeedInt     num_points;
   CUfunction  InterpAtPoints;
+  CUfunction  InterpTransposeAtPoints;
   CUfunction  GradAtPoints;
+  CUfunction  GradTransposeAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index bd2467e538..67f3d7ac7c 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -295,7 +295,9 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                      max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -321,16 +323,17 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index d70d75ab94..f42f2b1cff 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -22,7 +22,9 @@ typedef struct {
   CUmodule    moduleAtPoints;
   CeedInt     num_points;
   CUfunction  InterpAtPoints;
+  CUfunction  InterpTransposeAtPoints;
   CUfunction  GradAtPoints;
+  CUfunction  GradTransposeAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_collo_grad_1d;
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 8fb3d3fa20..7fdfb9f16d 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -192,7 +192,9 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -218,16 +220,17 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 5a695761a9..9740700d87 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -74,7 +74,9 @@ typedef struct {
   hipModule_t   moduleAtPoints;
   CeedInt       num_points;
   hipFunction_t InterpAtPoints;
+  hipFunction_t InterpTransposeAtPoints;
   hipFunction_t GradAtPoints;
+  hipFunction_t GradTransposeAtPoints;
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_q_weight_1d;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 6d8c858632..b08d1fa271 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -354,7 +354,9 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -380,16 +382,17 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index c000b7f873..962c088bc0 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -22,7 +22,9 @@ typedef struct {
   hipModule_t   moduleAtPoints;
   CeedInt       num_points;
   hipFunction_t InterpAtPoints;
+  hipFunction_t InterpTransposeAtPoints;
   hipFunction_t GradAtPoints;
+  hipFunction_t GradTransposeAtPoints;
   CeedInt       block_sizes[3];  // interp, grad, weight thread block sizes
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 2d17b55b2c..134547ecce 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                           const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                           const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
@@ -57,124 +57,239 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = 1;
-        CeedInt           post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
-
-        // Map from point
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          pre  = 1;
-          post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
-            // Update buffers used
-            pre /= 1;
-            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
-
-            // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
-
-            // Contract along middle index
-            for (CeedInt a = 0; a < pre; a++) {
-              for (CeedInt c = 0; c < post; c++) {
-                if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                } else {
-                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                }
-              }
-            }
-            post *= Q;
-          }
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
         }
+        post *= Q;
+      }
 
-        // Map from coefficients
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
         pre  = BASIS_NUM_QPTS;
         post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
           pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
+          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              CeedScalar v_k = 0;
+
+              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+              out[a * post + c] = v_k;
+            }
           }
-          post *= P;
+          post *= 1;
         }
       }
     }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = 1;
+      CeedInt           post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        pre  = 1;
+        post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
+          pre /= 1;
+          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              if (d == BASIS_DIM - 1) {
+                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+              } else {
+                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+              }
+            }
           }
           post *= Q;
         }
+      }
 
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = 0;
+  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
+
           pre  = BASIS_NUM_QPTS;
           post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
             // Update buffers used
             pre /= Q;
-            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
 
             // Contract along middle index
             for (CeedInt a = 0; a < pre; a++) {
@@ -193,12 +308,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   }
 }
 
-//------------------------------------------------------------------------------
-// Grad
-//------------------------------------------------------------------------------
-extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
-                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -213,147 +325,83 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt v_dim_stride  = 0;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt     pre   = 1;
-        CeedInt     post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt     pre   = 1;
+      CeedInt     post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
 
-        // Map from point
-        __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
-
-            pre  = 1;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= 1;
-              const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                  } else {
-                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                  }
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+
+          pre  = 1;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (dim_2 == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
               }
-              post *= Q;
             }
+            post *= Q;
           }
         }
-
-        // Map from coefficients
-        pre  = BASIS_NUM_QPTS;
-        post = 1;
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
-
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
-          }
-          post *= P;
-        }
       }
-    }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
 
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
-          }
-          post *= Q;
-        }
-
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
-
-            pre  = BASIS_NUM_QPTS;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= Q;
-              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  CeedScalar v_k = 0;
-
-                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
-                  out[a * post + c] = v_k;
-                }
-              }
-              post *= 1;
-            }
-          }
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
+        post *= P;
       }
     }
   }
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 9ce63a38de..188846386b 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                           const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                           const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
@@ -57,124 +57,239 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = 1;
-        CeedInt           post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
-
-        // Map from point
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          pre  = 1;
-          post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
-            // Update buffers used
-            pre /= 1;
-            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
-
-            // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
-
-            // Contract along middle index
-            for (CeedInt a = 0; a < pre; a++) {
-              for (CeedInt c = 0; c < post; c++) {
-                if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                } else {
-                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                }
-              }
-            }
-            post *= Q;
-          }
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
         }
+        post *= Q;
+      }
 
-        // Map from coefficients
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
         pre  = BASIS_NUM_QPTS;
         post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
           pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
+          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              CeedScalar v_k = 0;
+
+              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+              out[a * post + c] = v_k;
+            }
           }
-          post *= P;
+          post *= 1;
         }
       }
     }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = 1;
+      CeedInt           post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        pre  = 1;
+        post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
+          pre /= 1;
+          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              if (d == BASIS_DIM - 1) {
+                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+              } else {
+                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+              }
+            }
           }
           post *= Q;
         }
+      }
 
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = 0;
+  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
+
           pre  = BASIS_NUM_QPTS;
           post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
             // Update buffers used
             pre /= Q;
-            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
 
             // Contract along middle index
             for (CeedInt a = 0; a < pre; a++) {
@@ -193,12 +308,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   }
 }
 
-//------------------------------------------------------------------------------
-// Grad
-//------------------------------------------------------------------------------
-extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
-                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -213,147 +325,83 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt v_dim_stride  = 0;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt     pre   = 1;
-        CeedInt     post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt     pre   = 1;
+      CeedInt     post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
 
-        // Map from point
-        __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
-
-            pre  = 1;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= 1;
-              const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                  } else {
-                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                  }
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+
+          pre  = 1;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (dim_2 == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
               }
-              post *= Q;
             }
+            post *= Q;
           }
         }
-
-        // Map from coefficients
-        pre  = BASIS_NUM_QPTS;
-        post = 1;
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
-
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
-          }
-          post *= P;
-        }
       }
-    }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
 
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
-          }
-          post *= Q;
-        }
-
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
-
-            pre  = BASIS_NUM_QPTS;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= Q;
-              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  CeedScalar v_k = 0;
-
-                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
-                  out[a * post + c] = v_k;
-                }
-              }
-              post *= 1;
-            }
-          }
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
+        post *= P;
       }
     }
   }

From 9e1d4b8291fc4f4e19d20cdfdecd866260d0e6d2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 7 Nov 2024 14:43:33 -0700
Subject: [PATCH 230/571] gpu  - shared AtPoints

---
 backends/cuda-shared/ceed-cuda-shared-basis.c |  94 +++-
 backends/hip-shared/ceed-hip-shared-basis.c   |  82 +++-
 ...-shared-basis-tensor-at-points-templates.h | 448 ++++++++++++++++++
 .../cuda/cuda-shared-basis-tensor-at-points.h | 252 ++++++++++
 ...-shared-basis-tensor-at-points-templates.h | 448 ++++++++++++++++++
 .../hip/hip-shared-basis-tensor-at-points.h   | 274 +++++++++++
 tests/t354-basis.c                            |   4 +-
 7 files changed, 1568 insertions(+), 34 deletions(-)
 create mode 100644 include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
 create mode 100644 include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
 create mode 100644 include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
 create mode 100644 include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 67f3d7ac7c..623433ccf2 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -211,9 +211,9 @@ static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt nu
 static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
                                                   CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   Ceed                   ceed;
-  CeedInt                Q_1d, dim, max_num_points = num_points[0];
-  const CeedInt          is_transpose   = t_mode == CEED_TRANSPOSE;
-  const int              max_block_size = 32;
+  Ceed_Cuda             *ceed_Cuda;
+  CeedInt                Q_1d, dim, num_comp, max_num_points = num_points[0];
+  const CeedInt          is_transpose = t_mode == CEED_TRANSPOSE;
   const CeedScalar      *d_x, *d_u;
   CeedScalar            *d_v;
   CeedBasis_Cuda_shared *data;
@@ -221,6 +221,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
 
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
@@ -229,14 +230,13 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   }
 
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Cuda));
 
   // Check padded to uniform number of points per elem
   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
   {
-    CeedInt  num_comp, q_comp;
+    CeedInt  q_comp;
     CeedSize len, len_required;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
     CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
     len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
@@ -285,15 +285,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
     }
 
     // -- Compile kernels
-    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n";
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n";
     CeedInt    num_comp;
 
     if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
-                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
-                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
+    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+                                     CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
+                                     "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
@@ -323,17 +322,76 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      CeedInt P_1d, Q_1d;
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+
+      CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
+      void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
+                                                                                                 1));  // avoid >512 total threads
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1,
+                                                    elems_per_block, shared_mem, interp_args));
+      } else if (dim == 2) {
+        const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
+        // elems_per_block must be at least 1
+        CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-      CeedCallBackend(
-          CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
+        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
+                                                    thread_1d, elems_per_block, shared_mem, interp_args));
+      } else if (dim == 3) {
+        CeedInt elems_per_block = 1;
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
+                                                    thread_1d, elems_per_block, shared_mem, interp_args));
+      }
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      CeedInt P_1d, Q_1d;
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+
+      CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
+      void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
+                                                                                                 1));  // avoid >512 total threads
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
+        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1,
+                                                    elems_per_block, shared_mem, grad_args));
+      } else if (dim == 2) {
+        const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
+        // elems_per_block must be at least 1
+        CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
+                                                    elems_per_block, shared_mem, grad_args));
+      } else if (dim == 3) {
+        CeedInt elems_per_block = 1;
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
+                                                    elems_per_block, shared_mem, grad_args));
+      }
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index b08d1fa271..e1162b76d8 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -271,8 +271,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
                                                  CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
   Ceed                  ceed;
   CeedInt               Q_1d, dim, max_num_points = num_points[0];
-  const CeedInt         is_transpose   = t_mode == CEED_TRANSPOSE;
-  const int             max_block_size = 32;
+  const CeedInt         is_transpose = t_mode == CEED_TRANSPOSE;
   const CeedScalar     *d_x, *d_u;
   CeedScalar           *d_v;
   CeedBasis_Hip_shared *data;
@@ -344,15 +343,15 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
     }
 
     // -- Compile kernels
-    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n";
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h>\n";
     CeedInt    num_comp;
 
     if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                    Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
-                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
-                                    max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
+    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+                                    CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
+                                    "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE",
+                                    data->block_sizes[0]));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
@@ -382,17 +381,72 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      CeedInt P_1d, Q_1d;
+      CeedInt block_size = data->block_sizes[0];
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
+      void   *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
-      CeedCallBackend(
-          CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
+        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1,
+                                                   elems_per_block, shared_mem, interp_args));
+      } else if (dim == 2) {
+        // Check if required threads is small enough to do multiple elems
+        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
+                                                   thread_1d, elems_per_block, shared_mem, interp_args));
+      } else if (dim == 3) {
+        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
+                                                   thread_1d, elems_per_block, shared_mem, interp_args));
+      }
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      CeedInt P_1d, Q_1d;
+      CeedInt block_size = data->block_sizes[0];
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d   = CeedIntMax(Q_1d, P_1d);
+      void   *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
+        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1,
+                                                   elems_per_block, shared_mem, grad_args));
+      } else if (dim == 2) {
+        // Check if required threads is small enough to do multiple elems
+        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
+                                                   elems_per_block, shared_mem, grad_args));
+      } else if (dim == 3) {
+        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
+                                                   elems_per_block, shared_mem, grad_args));
+      }
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
new file mode 100644
index 0000000000..acf35a0dd7
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -0,0 +1,448 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory tensor product basis AtPoints templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 1D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = 0.0;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+      }
+    }
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * buffer[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = chebyshev_x[i] * r_U[comp];
+    }
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + p) % Q_1D;
+
+          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
+      }
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + p) % Q_1D;
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
+      }
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = ((j + p) % Q_1D);
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = 0.0;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+          }
+        }
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+        }
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        if (p < NUM_POINTS) {
+          for (CeedInt i = 0; i < Q_1D; i++) {
+            // Note: shifting to avoid atomic adds
+            const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+            for (CeedInt j = 0; j < Q_1D; j++) {
+              const CeedInt jj = ((j + p) % Q_1D);
+
+              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            }
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
new file mode 100644
index 0000000000..38e162bd45
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -0,0 +1,252 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+#include "cuda-shared-basis-read-write-templates.h"
+#include "cuda-shared-basis-tensor-at-points-templates.h"
+#include "cuda-shared-basis-tensor-templates.h"
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
+                                          const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      if (BASIS_DIM == 1) {
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
+        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
+      }
+    }
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                   const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
+        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
+        else r_U[j] = 0.0;
+      }
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
+                                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      if (BASIS_DIM == 1) {
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
+        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
+      }
+    }
+  }
+}
+
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
+        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
+        else r_U[j] = 0.0;
+      }
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
new file mode 100644
index 0000000000..73d8cba91b
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -0,0 +1,448 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory tensor product basis AtPoints templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 1D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = 0.0;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+      }
+    }
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * buffer[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = chebyshev_x[i] * r_U[comp];
+    }
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + p) % Q_1D;
+
+          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
+      }
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + p) % Q_1D;
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
+      }
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = ((j + p) % Q_1D);
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = 0.0;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+          }
+        }
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+        }
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        if (p < NUM_POINTS) {
+          for (CeedInt i = 0; i < Q_1D; i++) {
+            // Note: shifting to avoid atomic adds
+            const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+            for (CeedInt j = 0; j < Q_1D; j++) {
+              const CeedInt jj = ((j + p) % Q_1D);
+
+              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            }
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
new file mode 100644
index 0000000000..f96b843f30
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -0,0 +1,274 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+#include "hip-shared-basis-read-write-templates.h"
+#include "hip-shared-basis-tensor-at-points-templates.h"
+#include "hip-shared-basis-tensor-templates.h"
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      if (BASIS_DIM == 1) {
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
+        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
+      }
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                                 const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
+        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
+        else r_U[j] = 0.0;
+      }
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                      const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      if (BASIS_DIM == 1) {
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
+        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
+      }
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                               const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+      CeedScalar    r_X[BASIS_DIM];
+
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
+      }
+      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
+        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
+        else r_U[j] = 0.0;
+      }
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
diff --git a/tests/t354-basis.c b/tests/t354-basis.c
index e137f8c44f..156e4a9b26 100644
--- a/tests/t354-basis.c
+++ b/tests/t354-basis.c
@@ -1,6 +1,6 @@
 /// @file
-/// Test polynomial interpolation to arbitrary points in multiple dimensions
-/// \test Test polynomial interpolation to arbitrary points in multiple dimensions
+/// Test polynomial interpolation transpose to arbitrary points in multiple dimensions
+/// \test Test polynomial interpolation transpose to arbitrary points in multiple dimensions
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>

From 5e1f751ef2b988c9b3bd40351d2ca6b559914046 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 15 Nov 2024 13:56:20 -0700
Subject: [PATCH 231/571] tidy - minor fix

---
 interface/ceed-preconditioning.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 1cc767cb08..1879464450 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -452,8 +452,8 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   CeedElemRestriction elem_rstr_in, elem_rstr_out, index_elem_rstr_in, index_elem_rstr_out;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
   CeedCall(CeedOperatorGetCeed(op, &ceed));
+  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &num_nodes_in, &num_nodes_out));
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &elem_rstr_in, &elem_rstr_out));

From 1b3d9bd6d56939545c1e08ee5e87a420f2c02c77 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 18 Nov 2024 12:31:25 -0700
Subject: [PATCH 232/571] test - slightly loosen t354 tol

---
 tests/t354-basis.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/t354-basis.c b/tests/t354-basis.c
index 156e4a9b26..4d3402b257 100644
--- a/tests/t354-basis.c
+++ b/tests/t354-basis.c
@@ -85,7 +85,7 @@ int main(int argc, char **argv) {
       CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
       CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
       for (CeedInt j = 0; j < p_dim; j++) fx += u_array[j] * u_point_array[j];
-      if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) {
+      if (fabs(v_array[i] - fx) > 500. * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("[%" CeedInt_FMT "] %f != %f = f(%f", dim, v_array[i], fx, coord[0]);
         for (CeedInt d = 1; d < dim; d++) printf(", %f", coord[d]);

From 5f954c19a7f1ef8f2e1bd141b22369f85709825b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 27 Nov 2024 11:16:05 -0700
Subject: [PATCH 233/571] gpu - template loop over points for basis action

---
 ...-shared-basis-tensor-at-points-templates.h | 120 ++++++++++++++++++
 .../cuda/cuda-shared-basis-tensor-at-points.h |  98 ++------------
 ...-shared-basis-tensor-at-points-templates.h | 120 ++++++++++++++++++
 .../hip/hip-shared-basis-tensor-at-points.h   |  98 ++------------
 4 files changed, 256 insertions(+), 180 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index acf35a0dd7..35681df470 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -446,3 +446,123 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
     }
   }
 }
+
+//------------------------------------------------------------------------------
+// Loops over points
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interpolate to points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                      const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                               CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (BASIS_DIM == 1) {
+      InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Gradient at points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                    const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                             CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (BASIS_DIM == 1) {
+      GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 2) {
+      GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 3) {
+      GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index 38e162bd45..cfc6899476 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -51,26 +51,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
     }
 
     // Map to points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      if (BASIS_DIM == 1) {
-        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 2) {
-        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 3) {
-        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
-        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
-      }
-    }
+    InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                         &d_V[elem * BASIS_NUM_PTS]);
   }
 }
 
@@ -92,32 +74,9 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    // Clear register
-    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
-
     // Map from points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
-        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
-        else r_U[j] = 0.0;
-      }
-      if (BASIS_DIM == 1) {
-        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 2) {
-        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 3) {
-        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      }
-    }
-    __syncthreads();
+    InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                  &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -168,26 +127,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
     }
 
     // Map to points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      if (BASIS_DIM == 1) {
-        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 2) {
-        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 3) {
-        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
-        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
-      }
-    }
+    GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                       &d_V[elem * BASIS_NUM_PTS]);
   }
 }
 
@@ -209,32 +150,9 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    // Clear register
-    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
-
     // Map from points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
-        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
-        else r_U[j] = 0.0;
-      }
-      if (BASIS_DIM == 1) {
-        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 2) {
-        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 3) {
-        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      }
-    }
-    __syncthreads();
+    GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 73d8cba91b..7844810c2d 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -446,3 +446,123 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
     }
   }
 }
+
+//------------------------------------------------------------------------------
+// Loops over points
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interpolate to points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                      const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                               CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (BASIS_DIM == 1) {
+      InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Gradient at points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                    const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                             CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar    r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (BASIS_DIM == 1) {
+      GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 2) {
+      GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (BASIS_DIM == 3) {
+      GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index f96b843f30..355f53d0f4 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -57,26 +57,8 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     }
 
     // Map to points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      if (BASIS_DIM == 1) {
-        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 2) {
-        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 3) {
-        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
-        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
-      }
-    }
+    InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                         &d_V[elem * BASIS_NUM_PTS]);
   }
 }
 
@@ -103,32 +85,9 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    // Clear register
-    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
-
     // Map from points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
-        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
-        else r_U[j] = 0.0;
-      }
-      if (BASIS_DIM == 1) {
-        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 2) {
-        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 3) {
-        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      }
-    }
-    __syncthreads();
+    InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                  &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -185,26 +144,8 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     }
 
     // Map to points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      if (BASIS_DIM == 1) {
-        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 2) {
-        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      } else if (BASIS_DIM == 3) {
-        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
-        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
-      }
-    }
+    GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                       &d_V[elem * BASIS_NUM_PTS]);
   }
 }
 
@@ -231,32 +172,9 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    // Clear register
-    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
-
     // Map from points
-    const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
-
-    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-      const CeedInt p = i % BASIS_NUM_PTS;
-      CeedScalar    r_X[BASIS_DIM];
-
-      for (CeedInt d = 0; d < BASIS_DIM; d++) {
-        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
-      }
-      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
-        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
-        else r_U[j] = 0.0;
-      }
-      if (BASIS_DIM == 1) {
-        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 2) {
-        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      } else if (BASIS_DIM == 3) {
-        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
-      }
-    }
-    __syncthreads();
+    GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
 
     // Map from coefficients
     if (BASIS_DIM == 1) {

From 12235d7fd7d08f5cebdb10bab90c33bada4bdd05 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Mon, 2 Dec 2024 13:06:49 -0700
Subject: [PATCH 234/571] test: Update diff_csv to process commented lines

---
 tests/junit_common.py | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/tests/junit_common.py b/tests/junit_common.py
index 255f8218e2..94a888440f 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -13,7 +13,7 @@
 from itertools import product
 import sys
 import time
-from typing import Optional, Tuple, List
+from typing import Optional, Tuple, List, Callable
 
 sys.path.insert(0, str(Path(__file__).parent / "junit-xml"))
 from junit_xml import TestCase, TestSuite, to_xml_report_string  # nopep8
@@ -106,6 +106,15 @@ def cgns_tol(self):
     def cgns_tol(self, val):
         self._cgns_tol = val
 
+    @property
+    def diff_csv_kwargs(self):
+        """Keyword arguments to be passed to diff_csv()"""
+        return getattr(self, '_diff_csv_kwargs', {})
+
+    @diff_csv_kwargs.setter
+    def diff_csv_kwargs(self, val):
+        self._diff_csv_kwargs = val
+
     def post_test_hook(self, test: str, spec: TestSpec) -> None:
         """Function callback ran after each test case
 
@@ -262,7 +271,8 @@ def get_test_args(source_file: Path) -> List[TestSpec]:
             if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec('', args=['{ceed_resource}'])]
 
 
-def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: float = 1e-2) -> str:
+def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: float = 1e-2,
+             comment_str: str = '#', comment_func: Optional[Callable[[str, str], Optional[str]]] = None) -> str:
     """Compare CSV results against an expected CSV file with tolerances
 
     Args:
@@ -270,6 +280,8 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f
         true_csv (Path): Path to expected CSV results
         zero_tol (float, optional): Tolerance below which values are considered to be zero. Defaults to 3e-10.
         rel_tol (float, optional): Relative tolerance for comparing non-zero values. Defaults to 1e-2.
+        comment_str (str, optional): String to denoting commented line
+        comment_func (Callable, optional): Function to determine if test and true line are different
 
     Returns:
         str: Diff output between result and expected CSVs
@@ -281,6 +293,27 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f
         return f'No lines found in test output {test_csv}'
     if len(true_lines) == 0:
         return f'No lines found in test source {true_csv}'
+    if len(test_lines) != len(true_lines):
+        return f'Number of lines in {test_csv} and {true_csv} do not match'
+
+    # Process commented lines
+    uncommented_lines: List[int] = []
+    for n, (test_line, true_line) in enumerate(zip(test_lines, true_lines)):
+        if test_line[0] == comment_str and true_line[0] == comment_str:
+            if comment_func:
+                output = comment_func(test_line, true_line)
+                if output:
+                    return output
+        elif test_line[0] == comment_str and true_line[0] != comment_str:
+            return f'Commented line found in {test_csv} at line {n} but not in {true_csv}'
+        elif test_line[0] != comment_str and true_line[0] == comment_str:
+            return f'Commented line found in {true_csv} at line {n} but not in {test_csv}'
+        else:
+            uncommented_lines.append(n)
+
+    # Remove commented lines
+    test_lines = [test_lines[line] for line in uncommented_lines]
+    true_lines = [true_lines[line] for line in uncommented_lines]
 
     test_reader: csv.DictReader = csv.DictReader(test_lines)
     true_reader: csv.DictReader = csv.DictReader(true_lines)
@@ -288,8 +321,6 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f
         return ''.join(difflib.unified_diff([f'{test_lines[0]}\n'], [f'{true_lines[0]}\n'],
                        tofile='found CSV columns', fromfile='expected CSV columns'))
 
-    if len(test_lines) != len(true_lines):
-        return f'Number of lines in {test_csv} and {true_csv} do not match'
     diff_lines: List[str] = list()
     for test_line, true_line in zip(test_reader, true_reader):
         for key in test_reader.fieldnames:
@@ -435,7 +466,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
         ref_csvs: List[Path] = []
         output_files: List[str] = [arg for arg in run_args if 'ascii:' in arg]
         if output_files:
-            ref_csvs = [suite_spec.get_output_path(test, file.split('ascii:')[-1]) for file in output_files]
+            ref_csvs = [suite_spec.get_output_path(test, file.split(':')[1]) for file in output_files]
         ref_cgns: List[Path] = []
         output_files = [arg for arg in run_args if 'cgns:' in arg]
         if output_files:
@@ -484,7 +515,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
             elif not (Path.cwd() / csv_name).is_file():
                 test_case.add_failure_info('csv', output=f'{csv_name} not found')
             else:
-                diff: str = diff_csv(Path.cwd() / csv_name, ref_csv)
+                diff: str = diff_csv(Path.cwd() / csv_name, ref_csv, **suite_spec.diff_csv_kwargs)
                 if diff:
                     test_case.add_failure_info('csv', output=diff)
                 else:

From a8d440fb43f487d0488a49b7d39fdc2c4f91ac2b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 2 Dec 2024 15:30:31 -0700
Subject: [PATCH 235/571] gpu - simplify shared grid counting co-authored-by:
 zatkins-dev <zach.atkins@colorado.edu>

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 30 +++++++++----------
 backends/hip-shared/ceed-hip-shared-basis.c   | 30 +++++++++----------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 623433ccf2..33b07a3087 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -64,7 +64,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
       if (dim == 1) {
         CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
                                                                                                  1));  // avoid >512 total threads
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -77,7 +77,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
         CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -88,7 +88,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -115,7 +115,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
       if (dim == 1) {
         CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
                                                                                                  1));  // avoid >512 total threads
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -128,7 +128,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
         CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -139,7 +139,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -159,19 +159,19 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
       if (dim == 1) {
         const CeedInt elems_per_block = block_size / Q_1d;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
       } else if (dim == 2) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       } else if (dim == 3) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       }
@@ -334,7 +334,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
       if (dim == 1) {
         CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
                                                                                                  1));  // avoid >512 total threads
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1,
@@ -343,14 +343,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
         CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
                                                     thread_1d, elems_per_block, shared_mem, interp_args));
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
@@ -370,7 +370,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
       if (dim == 1) {
         CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
                                                                                                  1));  // avoid >512 total threads
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1,
@@ -379,14 +379,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
         CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
                                                     elems_per_block, shared_mem, grad_args));
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index e1162b76d8..3926623a21 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -123,7 +123,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
       if (dim == 1) {
         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -135,7 +135,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -146,7 +146,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -173,7 +173,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
       if (dim == 1) {
         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -185,7 +185,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -196,7 +196,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
@@ -218,19 +218,19 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
       if (dim == 1) {
         const CeedInt opt_elems       = block_size / Q_1d;
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
       } else if (dim == 2) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       } else if (dim == 3) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       }
@@ -392,7 +392,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
       if (dim == 1) {
         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1,
@@ -400,14 +400,14 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
                                                    thread_1d, elems_per_block, shared_mem, interp_args));
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
@@ -426,7 +426,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
       if (dim == 1) {
         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1,
@@ -434,14 +434,14 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
                                                    elems_per_block, shared_mem, grad_args));
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,

From 3f919cbc23e59285304a4fc152ac9ef29c71e01d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 3 Dec 2024 14:27:53 -0700
Subject: [PATCH 236/571] op - fix FLOPs estimates AtPoints

---
 include/ceed/backend.h    |   3 +-
 interface/ceed-basis.c    |  90 +++++++++++++++++++--------
 interface/ceed-operator.c |  44 +++++++++++---
 tests/t595-operator.c     | 125 ++++++++++++++++++++++++++++++++++++++
 tests/t595-operator.h     |  17 ++++++
 5 files changed, 246 insertions(+), 33 deletions(-)
 create mode 100644 tests/t595-operator.c
 create mode 100644 tests/t595-operator.h

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index e27d97cab3..5ec604ee5d 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -324,7 +324,8 @@ CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisSetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisReference(CeedBasis basis);
 CEED_EXTERN int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp);
-CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops);
+CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points,
+                                          CeedSize *flops);
 CEED_EXTERN int CeedBasisGetFESpace(CeedBasis basis, CeedFESpace *fe_space);
 CEED_EXTERN int CeedBasisGetTopologyDimension(CeedElemTopology topo, CeedInt *dim);
 CEED_EXTERN int CeedBasisGetTensorContract(CeedBasis basis, CeedTensorContract *contract);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 902207f75e..4a4f5fb180 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -779,17 +779,21 @@ int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode,
 /**
   @brief Estimate number of FLOPs required to apply `CeedBasis` in `t_mode` and `eval_mode`
 
-  @param[in]  basis     `CeedBasis` to estimate FLOPs for
-  @param[in]  t_mode    Apply basis or transpose
-  @param[in]  eval_mode @ref CeedEvalMode
-  @param[out] flops     Address of variable to hold FLOPs estimate
+  @param[in]  basis        `CeedBasis` to estimate FLOPs for
+  @param[in]  t_mode       Apply basis or transpose
+  @param[in]  eval_mode    @ref CeedEvalMode
+  @param[in]  is_at_points Evaluate the basis at points or quadrature points
+  @param[in]  num_points   Number of points basis is evaluated at
+  @param[out] flops        Address of variable to hold FLOPs estimate
 
   @ref Backend
 **/
-int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops) {
+int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points,
+                              CeedSize *flops) {
   bool is_tensor;
 
   CeedCall(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCheck(!is_at_points || is_tensor, CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Can only evaluate tensor-product bases at points");
   if (is_tensor) {
     CeedInt dim, num_comp, P_1d, Q_1d;
 
@@ -802,32 +806,68 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
       Q_1d = P_1d;
     }
     CeedInt tensor_flops = 0, pre = num_comp * CeedIntPow(P_1d, dim - 1), post = 1;
+
     for (CeedInt d = 0; d < dim; d++) {
       tensor_flops += 2 * pre * P_1d * post * Q_1d;
       pre /= P_1d;
       post *= Q_1d;
     }
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        *flops = 0;
-        break;
-      case CEED_EVAL_INTERP:
-        *flops = tensor_flops;
-        break;
-      case CEED_EVAL_GRAD:
-        *flops = tensor_flops * 2;
-        break;
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        // LCOV_EXCL_START
-        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
-                         CeedEvalModes[eval_mode]);
-        break;
-        // LCOV_EXCL_STOP
+    if (is_at_points) {
+      CeedInt chebyshev_flops = (Q_1d - 2) * 3 + 1, d_chebyshev_flops = (Q_1d - 2) * 8 + 1;
+      CeedInt point_tensor_flops = 0, pre = CeedIntPow(Q_1d, dim - 1), post = 1;
+
+      for (CeedInt d = 0; d < dim; d++) {
+        point_tensor_flops += 2 * pre * Q_1d * post * 1;
+        pre /= P_1d;
+        post *= Q_1d;
+      }
+
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          *flops = 0;
+          break;
+        case CEED_EVAL_INTERP:
+          *flops = tensor_flops + num_points * (dim * chebyshev_flops + point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
+          break;
+        case CEED_EVAL_GRAD:
+          *flops = tensor_flops + num_points * (dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops + point_tensor_flops +
+                                                       (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)));
+          break;
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL: {
+          // LCOV_EXCL_START
+          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
+                           CeedEvalModes[eval_mode]);
+          break;
+          // LCOV_EXCL_STOP
+        }
+        case CEED_EVAL_WEIGHT:
+          *flops = num_points;
+          break;
+      }
+    } else {
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          *flops = 0;
+          break;
+        case CEED_EVAL_INTERP:
+          *flops = tensor_flops;
+          break;
+        case CEED_EVAL_GRAD:
+          *flops = tensor_flops * 2;
+          break;
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL: {
+          // LCOV_EXCL_START
+          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
+                           CeedEvalModes[eval_mode]);
+          break;
+          // LCOV_EXCL_STOP
+        }
+        case CEED_EVAL_WEIGHT:
+          *flops = dim * CeedIntPow(Q_1d, dim);
+          break;
       }
-      case CEED_EVAL_WEIGHT:
-        *flops = dim * CeedIntPow(Q_1d, dim);
-        break;
     }
   } else {
     CeedInt dim, num_comp, q_comp, num_nodes, num_qpts;
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 96fe26a834..cc3493db87 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1097,8 +1097,14 @@ int CeedOperatorAtPointsGetPoints(CeedOperator op, CeedElemRestriction *rstr_poi
   CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for operator at points");
   CeedCall(CeedOperatorCheckReady(op));
 
-  if (rstr_points) CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points));
-  if (point_coords) CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords));
+  if (rstr_points) {
+    *rstr_points = NULL;
+    CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points));
+  }
+  if (point_coords) {
+    *point_coords = NULL;
+    CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords));
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1693,16 +1699,39 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
       *flops += suboperator_flops;
     }
   } else {
-    CeedInt             num_input_fields, num_output_fields, num_elem = 0;
+    bool                is_at_points;
+    CeedInt             num_input_fields, num_output_fields, num_elem = 0, num_points = 0;
     CeedQFunction       qf;
     CeedQFunctionField *qf_input_fields, *qf_output_fields;
     CeedOperatorField  *op_input_fields, *op_output_fields;
 
+    CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+    CeedCall(CeedOperatorGetNumElements(op, &num_elem));
+    if (is_at_points) {
+      CeedMemType         mem_type;
+      CeedElemRestriction rstr_points = NULL;
+
+      CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCall(CeedGetPreferredMemType(CeedOperatorReturnCeed(op), &mem_type));
+      if (mem_type == CEED_MEM_DEVICE) {
+        // Device backends pad out to the same number of points per element
+        CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &num_points));
+      } else {
+        num_points = 0;
+        for (CeedInt i = 0; i < num_elem; i++) {
+          CeedInt points_in_elem = 0;
+
+          CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr_points, i, &points_in_elem));
+          num_points += points_in_elem;
+        }
+        num_points = num_points / num_elem + (num_points % num_elem > 0);
+      }
+      CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+    }
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields));
     CeedCall(CeedQFunctionDestroy(&qf));
     CeedCall(CeedOperatorGetFields(op, NULL, &op_input_fields, NULL, &op_output_fields));
-    CeedCall(CeedOperatorGetNumElements(op, &num_elem));
 
     // Input FLOPs
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1721,7 +1750,7 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, is_at_points, num_points, &basis_flops));
         CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
@@ -1733,7 +1762,8 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
       CeedSize      qf_flops;
       CeedQFunction qf;
 
-      CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
+      if (is_at_points) num_qpts = num_points;
+      else CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
       CeedCall(CeedOperatorGetQFunction(op, &qf));
       CeedCall(CeedQFunctionGetFlopsEstimate(qf, &qf_flops));
       CeedCall(CeedQFunctionDestroy(&qf));
@@ -1759,7 +1789,7 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, is_at_points, num_points, &basis_flops));
         CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
diff --git a/tests/t595-operator.c b/tests/t595-operator.c
new file mode 100644
index 0000000000..e874ccb2ba
--- /dev/null
+++ b/tests/t595-operator.c
@@ -0,0 +1,125 @@
+/// @file
+/// Test FLOP estimation for mass matrix operator at points
+/// \test Test FLOP estimation for mass matrix operator at points
+#include "t595-operator.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char **argv) {
+  Ceed    ceed;
+  CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5;
+  CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedSize            flop_estimate = 0;
+  CeedVector          x_points, q_data;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_u;
+  CeedBasis           basis_x, basis_u;
+  CeedQFunction       qf_mass;
+  CeedOperator        op_mass;
+  bool                is_at_points;
+
+  CeedInit(argv[1], &ceed);
+
+  // Point reference coordinates
+  CeedVectorCreate(ceed, dim * num_points, &x_points);
+  {
+    CeedScalar x_array[dim * num_points];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      for (CeedInt d = 0; d < dim; d++) {
+        x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+      }
+    }
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  {
+    CeedInt ind_x[num_elem + 1 + num_points];
+
+    for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+    for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+  }
+
+  // Q data
+  CeedVectorCreate(ceed, num_points, &q_data);
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x);
+
+  // Cell solution
+  {
+    CeedInt ind_u[num_elem * p * p];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+
+  // Mass operator
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+  CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass);
+  CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_mass, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  // Estimate FLOPs
+  CeedQFunctionSetUserFlopsEstimate(qf_mass, 1);
+  CeedOperatorGetFlopsEstimate(op_mass, &flop_estimate);
+
+  // Check output
+  if (flop_estimate != 16317) {
+    // LCOV_EXCL_START
+    printf("Incorrect FLOP estimate computed, %ld != 16317\n", flop_estimate);
+    // LCOV_EXCL_STOP
+  }
+
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_u);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedQFunctionDestroy(&qf_mass);
+  CeedOperatorDestroy(&op_mass);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t595-operator.h b/tests/t595-operator.h
new file mode 100644
index 0000000000..e2dcddf09d
--- /dev/null
+++ b/tests/t595-operator.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0], *rho = in[1];
+  CeedScalar       *v = out[0];
+
+  // Quadrature point loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = rho[i] * u[i]; }
+  return 0;
+}

From f815fac990b20019e227e6950cc74a96439f9eba Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Dec 2024 15:32:50 -0700
Subject: [PATCH 237/571] gen - fun name standardization

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 121 ++++++++++--------
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 121 ++++++++++--------
 .../ceed/jit-source/cuda/cuda-gen-templates.h |  62 ++++-----
 .../ceed/jit-source/hip/hip-gen-templates.h   |  62 ++++-----
 4 files changed, 194 insertions(+), 172 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index c0f4b081a0..1e5d56863d 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -166,18 +166,18 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
       if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
       else data->B.outputs[i] = basis_data->d_interp_1d;
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
-      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       break;
     case CEED_EVAL_GRAD:
       if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
       else data->B.outputs[i] = basis_data->d_interp_1d;
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
-      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
         code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
-        code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
 
@@ -185,10 +185,10 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         } else {
           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n";
-          code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         }
       }
       break;
@@ -196,7 +196,6 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
       break;  // No action
       // LCOV_EXCL_START
     case CEED_EVAL_DIV:
-      break;  // TODO: Not implemented
     case CEED_EVAL_CURL:
       break;  // TODO: Not implemented
               // LCOV_EXCL_STOP
@@ -215,6 +214,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
   CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
   CeedSize                  l_size;
+  CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
   CeedElemRestriction_Cuda *rstr_data;
   CeedElemRestriction       elem_rstr;
   CeedBasis                 basis;
@@ -222,6 +222,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
@@ -242,8 +243,6 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
       // Restriction was already done for previous input
       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
-      bool is_strided;
-
       if (eval_mode == CEED_EVAL_NONE) {
         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
@@ -251,18 +250,59 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
         // Otherwise we're using the scratch space
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
       }
-      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-      if (!is_strided) {
+      switch (rstr_type) {
+        case CEED_RESTRICTION_STANDARD: {
+          CeedInt comp_stride;
+
+          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+          code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << "    // CompStride: " << comp_stride << "\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          code << "    ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
+               << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          break;
+        }
+        case CEED_RESTRICTION_STRIDED: {
+          bool    has_backend_strides;
+          CeedInt num_elem;
+
+          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+
+          if (!has_backend_strides) {
+            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+          }
+          code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
+          code << "    ReadLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+               << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          break;
+        }
+        // LCOV_EXCL_START
+        case CEED_RESTRICTION_ORIENTED:
+        case CEED_RESTRICTION_CURL_ORIENTED:
+        case CEED_RESTRICTION_POINTS:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else {
+    // Output
+    switch (rstr_type) {
+      case CEED_RESTRICTION_STANDARD: {
         CeedInt comp_stride;
 
         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
-        data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    readDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
-             << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
-      } else {
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        code << "    WriteLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
+             << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        break;
+      }
+      case CEED_RESTRICTION_STRIDED: {
         bool    has_backend_strides;
         CeedInt num_elem;
 
@@ -274,39 +314,16 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    readDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-             << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
-      }
-    }
-  } else {
-    // Output
-    bool is_strided;
-
-    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-    if (!is_strided) {
-      CeedInt comp_stride;
-
-      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-      code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
-      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-      code << "    // CompStride: " << comp_stride << "\n";
-      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-      code << "    writeDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
-           << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
-    } else {
-      bool    has_backend_strides;
-      CeedInt num_elem;
-
-      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
-
-      if (!has_backend_strides) {
-        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+        code << "    WriteLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+             << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        break;
       }
-      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-      code << "    writeDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-           << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+      // LCOV_EXCL_START
+      case CEED_RESTRICTION_ORIENTED:
+      case CEED_RESTRICTION_CURL_ORIENTED:
+      case CEED_RESTRICTION_POINTS:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
   }
   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
@@ -483,7 +500,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      readSliceQuadsStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
+            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
           } else {
             CeedSize                  l_size = 0;
@@ -496,7 +513,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
             code << "      // CompStride: " << comp_stride << "\n";
             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      readSliceQuadsOffset3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
+            code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
@@ -509,8 +526,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           break;
         case CEED_EVAL_GRAD:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      gradCollo3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix << ", r_s"
-               << var_suffix << ");\n";
+          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
+               << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -614,8 +631,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      gradColloTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" << var_suffix
-               << ", r_q" << var_suffix << ");\n";
+          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 3525bbd2fe..a55e6e5cc5 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -187,18 +187,18 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
       if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
       else data->B.outputs[i] = basis_data->d_interp_1d;
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
-      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       break;
     case CEED_EVAL_GRAD:
       if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
       else data->B.outputs[i] = basis_data->d_interp_1d;
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
-      code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
         code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
-        code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
 
@@ -206,10 +206,10 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         } else {
           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n";
-          code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         }
       }
       break;
@@ -236,6 +236,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
   CeedInt                  elem_size = 0, num_comp = 0, P_1d = 0;
   CeedSize                 l_size;
+  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
   CeedElemRestriction_Hip *rstr_data;
   CeedElemRestriction      elem_rstr;
   CeedBasis                basis;
@@ -243,6 +244,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
@@ -255,7 +257,6 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
 
   // Restriction
   if (is_input) {
-    // Input
     // Input
     if (field_input_buffer[i] != i) {
       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
@@ -263,8 +264,6 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
       // Restriction was already done for previous input
       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
-      bool is_strided;
-
       if (eval_mode == CEED_EVAL_NONE) {
         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
@@ -272,18 +271,59 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
         // Otherwise we're using the scratch space
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
       }
-      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-      if (!is_strided) {
+      switch (rstr_type) {
+        case CEED_RESTRICTION_STANDARD: {
+          CeedInt comp_stride;
+
+          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+          code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << "    // CompStride: " << comp_stride << "\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          code << "    ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
+               << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          break;
+        }
+        case CEED_RESTRICTION_STRIDED: {
+          bool    has_backend_strides;
+          CeedInt num_elem;
+
+          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+
+          if (!has_backend_strides) {
+            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+          }
+          code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
+          code << "    ReadLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+               << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          break;
+        }
+        // LCOV_EXCL_START
+        case CEED_RESTRICTION_ORIENTED:
+        case CEED_RESTRICTION_CURL_ORIENTED:
+        case CEED_RESTRICTION_POINTS:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else {
+    // Output
+    switch (rstr_type) {
+      case CEED_RESTRICTION_STANDARD: {
         CeedInt comp_stride;
 
         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
-        data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    readDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
-             << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
-      } else {
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        code << "    WriteLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
+             << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        break;
+      }
+      case CEED_RESTRICTION_STRIDED: {
         bool    has_backend_strides;
         CeedInt num_elem;
 
@@ -295,39 +335,16 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    readDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-             << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
-      }
-    }
-  } else {
-    // Output
-    bool is_strided;
-
-    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-    if (!is_strided) {
-      CeedInt comp_stride;
-
-      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-      code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
-      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-      code << "    // CompStride: " << comp_stride << "\n";
-      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-      code << "    writeDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
-           << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
-    } else {
-      bool    has_backend_strides;
-      CeedInt num_elem;
-
-      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
-
-      if (!has_backend_strides) {
-        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+        code << "    WriteLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
+             << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        break;
       }
-      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-      code << "    writeDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-           << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+      // LCOV_EXCL_START
+      case CEED_RESTRICTION_ORIENTED:
+      case CEED_RESTRICTION_CURL_ORIENTED:
+      case CEED_RESTRICTION_POINTS:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -501,7 +518,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      readSliceQuadsStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
+            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
           } else {
             CeedSize                 l_size = 0;
@@ -514,7 +531,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
             code << "      // CompStride: " << comp_stride << "\n";
             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      readSliceQuadsOffset3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
+            code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
           break;
@@ -526,8 +543,8 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           break;
         case CEED_EVAL_GRAD:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      gradCollo3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix << ", r_s"
-               << var_suffix << ");\n";
+          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
+               << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -631,8 +648,8 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      gradColloTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" << var_suffix
-               << ", r_q" << var_suffix << ");\n";
+          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index eb566137ee..8a79ba5989 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -13,7 +13,7 @@
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
 template <int P, int Q>
-inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
@@ -25,8 +25,8 @@ inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__res
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = indices[node + elem * P_1d];
@@ -39,7 +39,7 @@ inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
+inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
@@ -53,8 +53,8 @@ inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt el
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = indices[node + elem * P_1d];
@@ -67,7 +67,7 @@ inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nu
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
@@ -85,8 +85,8 @@ inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt e
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
@@ -99,7 +99,7 @@ inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
+inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
@@ -113,8 +113,8 @@ inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt el
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
@@ -127,7 +127,7 @@ inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nu
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
@@ -144,16 +144,9 @@ inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-// TODO: remove "Dofs" and "Quads" in the following function names?
-//   - readDofsOffset3d -> readOffset3d ?
-//   - readDofsStrided3d -> readStrided3d ?
-//   - readSliceQuadsOffset3d -> readSliceOffset3d ?
-//   - readSliceQuadsStrided3d -> readSliceStrided3d ?
-//   - writeDofsOffset3d -> writeOffset3d ?
-//   - writeDofsStrided3d -> writeStrided3d ?
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
@@ -167,7 +160,7 @@ inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
+inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
@@ -182,8 +175,9 @@ inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt el
 // E-vector -> Q-vector, offests provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
-inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
-                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
+                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
+                                               CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
     const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
@@ -196,8 +190,8 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedI
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
-                                               CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
+                                              CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -210,8 +204,8 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const Ceed
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
@@ -225,7 +219,7 @@ inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nu
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
@@ -240,8 +234,8 @@ inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt e
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int Q_1d>
-inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                   CeedScalar *__restrict__ r_V) {
+inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                        CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
@@ -266,8 +260,8 @@ inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int Q_1d>
-inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                            CeedScalar *__restrict__ r_V) {
+inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                 CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // X derivative
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 02b4a7fd51..7059d8dafe 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -13,7 +13,7 @@
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
 template <int P, int Q>
-inline __device__ void loadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
@@ -25,8 +25,8 @@ inline __device__ void loadMatrix(SharedData_Hip &data, const CeedScalar *__rest
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = indices[node + elem * P_1d];
@@ -39,7 +39,7 @@ inline __device__ void readDofsOffset1d(SharedData_Hip &data, const CeedInt num_
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -52,8 +52,8 @@ inline __device__ void readDofsStrided1d(SharedData_Hip &data, const CeedInt ele
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = indices[node + elem * P_1d];
@@ -66,7 +66,7 @@ inline __device__ void writeDofsOffset1d(SharedData_Hip &data, const CeedInt num
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d) {
     const CeedInt node = data.t_id_x;
@@ -84,8 +84,8 @@ inline __device__ void writeDofsStrided1d(SharedData_Hip &data, const CeedInt el
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
@@ -98,7 +98,7 @@ inline __device__ void readDofsOffset2d(SharedData_Hip &data, const CeedInt num_
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -111,8 +111,8 @@ inline __device__ void readDofsStrided2d(SharedData_Hip &data, const CeedInt ele
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
@@ -125,7 +125,7 @@ inline __device__ void writeDofsOffset2d(SharedData_Hip &data, const CeedInt num
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
@@ -142,16 +142,9 @@ inline __device__ void writeDofsStrided2d(SharedData_Hip &data, const CeedInt el
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-// TODO: remove "Dofs" and "Quads" in the following function names?
-//   - readDofsOffset3d -> readOffset3d ?
-//   - readDofsStrided3d -> readStrided3d ?
-//   - readSliceQuadsOffset3d -> readSliceOffset3d ?
-//   - readSliceQuadsStrided3d -> readSliceStrided3d ?
-//   - writeDofsOffset3d -> writeOffset3d ?
-//   - writeDofsStrided3d -> writeStrided3d ?
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
@@ -165,7 +158,7 @@ inline __device__ void readDofsOffset3d(SharedData_Hip &data, const CeedInt num_
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
@@ -179,8 +172,9 @@ inline __device__ void readDofsStrided3d(SharedData_Hip &data, const CeedInt ele
 // E-vector -> Q-vector, offests provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
-inline __device__ void readSliceQuadsOffset3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
-                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
+                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
+                                               CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
     const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
@@ -193,8 +187,8 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Hip &data, const CeedIn
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readSliceQuadsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
-                                               CeedScalar *__restrict__ r_u) {
+inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
+                                              CeedScalar *__restrict__ r_u) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -207,8 +201,8 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Hip &data, const CeedI
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
@@ -222,7 +216,7 @@ inline __device__ void writeDofsOffset3d(SharedData_Hip &data, const CeedInt num
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
   if (data.t_id_x < P_1d && data.t_id_y < P_1d)
     for (CeedInt z = 0; z < P_1d; z++) {
@@ -237,8 +231,8 @@ inline __device__ void writeDofsStrided3d(SharedData_Hip &data, const CeedInt el
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int Q_1d>
-inline __device__ void gradCollo3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                   CeedScalar *__restrict__ r_V) {
+inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                        CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
@@ -263,8 +257,8 @@ inline __device__ void gradCollo3d(SharedData_Hip &data, const CeedInt q, const
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int Q_1d>
-inline __device__ void gradColloTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                            CeedScalar *__restrict__ r_V) {
+inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                 CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // X derivative

From b6a2eb7998676e206f6df72229aa5643127bbcef Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Dec 2024 16:50:02 -0700
Subject: [PATCH 238/571] shared - AtPoints template changes for gen

---
 .../cuda-shared-basis-read-write-templates.h  |  40 ++++++
 ...-shared-basis-tensor-at-points-templates.h | 120 ------------------
 .../cuda/cuda-shared-basis-tensor-at-points.h |  81 ++++++++++--
 .../hip-shared-basis-read-write-templates.h   |  39 ++++++
 ...-shared-basis-tensor-at-points-templates.h | 120 ------------------
 .../hip/hip-shared-basis-tensor-at-points.h   |  81 ++++++++++--
 6 files changed, 225 insertions(+), 256 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index 8671dc6423..49e7eca873 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -161,3 +161,43 @@ inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt
     }
   }
 }
+
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// E-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                 const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem,
+                                 const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+  if (p < points_in_elem) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = d_u[ind + comp * strides_comp];
+    }
+  } else {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = 0.0;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> E-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
+                                  CeedScalar *d_v) {
+  if (p < points_in_elem) {
+    const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] = r_v[comp];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 35681df470..acf35a0dd7 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -446,123 +446,3 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
     }
   }
 }
-
-//------------------------------------------------------------------------------
-// Loops over points
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-// Interpolate to points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
-                                      const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    if (DIM == 1) {
-      InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 2) {
-      InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 3) {
-      InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    }
-    if (i < NUM_PTS) {
-      for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// Interpolate from points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
-                                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
-                                               CeedScalar *__restrict__ r_C) {
-  // Clear register
-  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
-
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    for (CeedInt j = 0; j < NUM_COMP; j++) {
-      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
-      else r_U[j] = 0.0;
-    }
-    if (BASIS_DIM == 1) {
-      InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 2) {
-      InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 3) {
-      InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    }
-  }
-  __syncthreads();
-}
-
-//------------------------------------------------------------------------------
-// Gradient at points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
-                                    const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    if (DIM == 1) {
-      GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 2) {
-      GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 3) {
-      GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    }
-    if (i < NUM_PTS) {
-      for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// Grad from points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
-                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
-                                             CeedScalar *__restrict__ r_C) {
-  // Clear register
-  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
-
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
-      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
-      else r_U[j] = 0.0;
-    }
-    if (BASIS_DIM == 1) {
-      GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 2) {
-      GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 3) {
-      GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    }
-  }
-  __syncthreads();
-}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index cfc6899476..cd9021611a 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -31,6 +31,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -51,8 +52,21 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
     }
 
     // Map to points
-    InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
-                                                                         &d_V[elem * BASIS_NUM_PTS]);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
   }
 }
 
@@ -68,15 +82,33 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
     // Map from points
-    InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
-                                                                                  &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -107,6 +139,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
@@ -127,8 +160,21 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
     }
 
     // Map to points
-    GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
-                                                                       &d_V[elem * BASIS_NUM_PTS]);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
   }
 }
 
@@ -144,15 +190,34 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
     // Map from points
-    GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
-                                                                                &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index 8691a92710..a5313ec925 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -171,3 +171,42 @@ inline __device__ void SumElementStrided3d(SharedData_Hip &data, const CeedInt e
     }
   }
 }
+
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// E-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, const CeedInt strides_point,
+                                 const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+  if (p < points_in_elem) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = d_u[ind + comp * strides_comp];
+    }
+  } else {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = 0.0;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> E-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
+                                  CeedScalar *d_v) {
+  if (p < points_in_elem) {
+    const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] = r_v[comp];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 7844810c2d..73d8cba91b 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -446,123 +446,3 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
     }
   }
 }
-
-//------------------------------------------------------------------------------
-// Loops over points
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-// Interpolate to points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
-                                      const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    if (DIM == 1) {
-      InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 2) {
-      InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 3) {
-      InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    }
-    if (i < NUM_PTS) {
-      for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// Interpolate from points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
-                                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
-                                               CeedScalar *__restrict__ r_C) {
-  // Clear register
-  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
-
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    for (CeedInt j = 0; j < NUM_COMP; j++) {
-      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
-      else r_U[j] = 0.0;
-    }
-    if (BASIS_DIM == 1) {
-      InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 2) {
-      InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 3) {
-      InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    }
-  }
-  __syncthreads();
-}
-
-//------------------------------------------------------------------------------
-// Gradient at points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
-                                    const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    if (DIM == 1) {
-      GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 2) {
-      GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    } else if (DIM == 3) {
-      GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
-    }
-    if (i < NUM_PTS) {
-      for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// Grad from points
-//------------------------------------------------------------------------------
-template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
-                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
-                                             CeedScalar *__restrict__ r_C) {
-  // Clear register
-  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
-
-  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
-
-  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
-    const CeedInt p = i % NUM_PTS;
-    CeedScalar    r_X[DIM];
-
-    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
-    for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
-      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
-      else r_U[j] = 0.0;
-    }
-    if (BASIS_DIM == 1) {
-      GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 2) {
-      GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    } else if (BASIS_DIM == 3) {
-      GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
-    }
-  }
-  __syncthreads();
-}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 355f53d0f4..9f5e947a07 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -37,6 +37,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -57,8 +58,21 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     }
 
     // Map to points
-    InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
-                                                                         &d_V[elem * BASIS_NUM_PTS]);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
   }
 }
 
@@ -79,15 +93,33 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
     // Map from points
-    InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
-                                                                                  &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -124,6 +156,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
@@ -144,8 +177,21 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     }
 
     // Map to points
-    GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
-                                                                       &d_V[elem * BASIS_NUM_PTS]);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
   }
 }
 
@@ -166,15 +212,34 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
+  CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
     // Map from points
-    GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
-                                                                                &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {

From ce2232d0cd83b6469387c731df8a2d357870e5de Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 10 Dec 2024 12:35:25 -0700
Subject: [PATCH 239/571] doc - update docstrings for RstrType and FEType

---
 include/ceed/backend.h | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 5ec604ee5d..3884501f4b 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -75,7 +75,7 @@
 #endif
 
 /**
-  This enum supples common colors for CeedDebug256 debugging output.
+  This enum supplies common colors for CeedDebug256 debugging output.
   Set the environment variable `CEED_DEBUG = 1` to activate debugging output.
 
   @ingroup Ceed
@@ -267,8 +267,12 @@ CEED_EXTERN int CeedVectorGetData(CeedVector vec, void *data);
 CEED_EXTERN int CeedVectorSetData(CeedVector vec, void *data);
 CEED_EXTERN int CeedVectorReference(CeedVector vec);
 
-/// Type of element restriction;
-/// @ingroup CeedElemRestriction
+/**
+  Specify type of restriction operation.
+
+  @ingroup CeedElemRestriction
+  @ref     Backend
+**/
 typedef enum {
   /// Standard element restriction with offsets
   CEED_RESTRICTION_STANDARD = 1,
@@ -305,8 +309,12 @@ CEED_EXTERN int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data)
 CEED_EXTERN int CeedElemRestrictionReference(CeedElemRestriction rstr);
 CEED_EXTERN int CeedElemRestrictionGetFlopsEstimate(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedSize *flops);
 
-/// Type of FE space;
-/// @ingroup CeedBasis
+/**
+  Specify type of FE space.
+
+  @ingroup CeedBasis
+  @ref     Backend
+**/
 typedef enum {
   /// H^1 FE space
   CEED_FE_SPACE_H1 = 1,

From dac827217876cb432a93a9823b361b03fbf022dd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 11 Dec 2024 11:20:45 -0700
Subject: [PATCH 240/571] minor - style

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 33b07a3087..8a3def90b3 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -62,8 +62,8 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
       void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
 
       if (dim == 1) {
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
-                                                                                                 1));  // avoid >512 total threads
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
@@ -113,8 +113,8 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
       }
       void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v};
       if (dim == 1) {
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
-                                                                                                 1));  // avoid >512 total threads
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
@@ -332,8 +332,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
       void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v};
 
       if (dim == 1) {
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
-                                                                                                 1));  // avoid >512 total threads
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
@@ -368,8 +368,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
       void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
 
       if (dim == 1) {
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
-                                                                                                 1));  // avoid >512 total threads
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 

From b4280a96583940f87169e4a342af92c298f7bcf5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 11 Dec 2024 11:21:11 -0700
Subject: [PATCH 241/571] hip - reduce elem per block for 3d shared AtPoints
 basis

---
 backends/hip-shared/ceed-hip-shared-basis.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 3926623a21..90357211ad 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -406,7 +406,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
                                                    thread_1d, elems_per_block, shared_mem, interp_args));
       } else if (dim == 3) {
-        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        const CeedInt elems_per_block = 1;
         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
@@ -440,7 +440,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
                                                    elems_per_block, shared_mem, grad_args));
       } else if (dim == 3) {
-        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        const CeedInt elems_per_block = 1;
         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 

From 8b97b69a34af3dd9e1bc9b8fd8651212448dc9ec Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 5 Dec 2024 14:50:50 -0700
Subject: [PATCH 242/571] cuda - AtPoints for cuda/gen

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 396 ++++++++++++++----
 backends/cuda-gen/ceed-cuda-gen-operator.c    |  62 ++-
 backends/cuda-gen/ceed-cuda-gen.c             |   1 +
 backends/cuda-gen/ceed-cuda-gen.h             |   1 +
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |   2 +-
 .../ceed/jit-source/cuda/cuda-gen-templates.h |  32 ++
 include/ceed/jit-source/cuda/cuda-types.h     |   7 +
 7 files changed, 415 insertions(+), 86 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 1e5d56863d..cafcbd6d94 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -124,7 +124,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
+                                                     bool use_3d_slices) {
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string            P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
   std::string            option_name = (is_input ? "inputs" : "outputs");
@@ -155,7 +156,6 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
     code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
-  CeedCallBackend(CeedBasisDestroy(&basis));
 
   // Load basis data
   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -163,16 +163,55 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
     case CEED_EVAL_NONE:
       break;
     case CEED_EVAL_INTERP:
-      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
-      else data->B.outputs[i] = basis_data->d_interp_1d;
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallCuda(CeedBasisReturnCeed(basis),
+                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       break;
     case CEED_EVAL_GRAD:
-      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
-      else data->B.outputs[i] = basis_data->d_interp_1d;
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallCuda(CeedBasisReturnCeed(basis),
+                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (is_at_points) break;  // No G mat for AtPoints
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
@@ -200,6 +239,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
       break;  // TODO: Not implemented
               // LCOV_EXCL_STOP
   }
+  CeedCallBackend(CeedBasisDestroy(&basis));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -208,7 +248,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                       CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+                                                       CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string               P_name     = "P_1d" + var_suffix;
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
@@ -242,11 +282,11 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 
       // Restriction was already done for previous input
       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
-    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
-      if (eval_mode == CEED_EVAL_NONE) {
+    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
+      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
-      } else {
+      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
         // Otherwise we're using the scratch space
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
       }
@@ -279,10 +319,17 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
                << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
+        case CEED_RESTRICTION_POINTS: {
+          CeedInt comp_stride;
+
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          break;
+        }
         // LCOV_EXCL_START
         case CEED_RESTRICTION_ORIENTED:
         case CEED_RESTRICTION_CURL_ORIENTED:
-        case CEED_RESTRICTION_POINTS:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
       }
@@ -318,10 +365,12 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
              << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
+      case CEED_RESTRICTION_POINTS:
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        break;
       // LCOV_EXCL_START
       case CEED_RESTRICTION_ORIENTED:
       case CEED_RESTRICTION_CURL_ORIENTED:
-      case CEED_RESTRICTION_POINTS:
         break;  // TODO: Not implemented
                 // LCOV_EXCL_STOP
     }
@@ -335,7 +384,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
-                                                 bool use_3d_slices) {
+                                                 bool is_at_points, bool use_3d_slices) {
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
@@ -361,17 +410,27 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!use_3d_slices) {
+        if (!use_3d_slices && !is_at_points) {
           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-             << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        if (is_at_points) {
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        }
         break;
       case CEED_EVAL_GRAD:
-        if (use_3d_slices) {
+        if (is_at_points) {
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
           code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
                << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
@@ -383,12 +442,16 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         }
         break;
       case CEED_EVAL_WEIGHT: {
-        CeedBasis_Cuda_shared *basis_data;
+        if (is_at_points) {
+          code << "    // Nothing to do AtPoints\n";
+        } else {
+          CeedBasis_Cuda_shared *basis_data;
 
-        code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->W = basis_data->d_q_weight_1d;
-        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
+          CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+          data->W = basis_data->d_q_weight_1d;
+          code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+        }
         break;
       }
       // LCOV_EXCL_START
@@ -404,12 +467,20 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         break;  // No action
       case CEED_EVAL_INTERP:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
-        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-             << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        if (is_at_points) {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        }
         break;
       case CEED_EVAL_GRAD:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
-        if (use_3d_slices) {
+        if (is_at_points) {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
                << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
@@ -434,40 +505,144 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
 //------------------------------------------------------------------------------
 // QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt num_input_fields,
-                                                     CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
-                                                     CeedInt num_output_fields, CeedOperatorField *op_output_fields,
-                                                     CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
-                                                     bool use_3d_slices) {
+static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt max_num_points,
+                                                     CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
+                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
+                                                     std::string qfunction_name, CeedInt Q_1d, bool is_at_points, bool use_3d_slices) {
   std::string         Q_name    = "Q_1d";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
 
-  // Setup output arays
+  // Setup output arrays
   code << "\n    // -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     std::string var_suffix = "_out_" + std::to_string(i);
 
     code << "    // ---- Output field " << i << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
-      code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        if (is_at_points) {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        }
+        break;
+      case CEED_EVAL_INTERP:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
+          code << "    }\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        }
+        break;
+      case CEED_EVAL_GRAD:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "*dim];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
+          code << "    }\n";
+        } else if (use_3d_slices) {
+          // Accumulator for gradient slices
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
+          code << "      r_q" << var_suffix << "[i] = 0.0;\n";
+          code << "    }\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+        }
+        break;
+      case CEED_EVAL_WEIGHT:
+        break;
+        // LCOV_EXCL_START
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
-    if (eval_mode == CEED_EVAL_GRAD) {
-      if (use_3d_slices) {
-        // Accumulator for gradient slices
-        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-        code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
-        code << "      r_q" << var_suffix << "[i] = 0.0;\n";
-        code << "    }\n";
-      } else {
-        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+  }
+
+  if (is_at_points) {
+    // We need to handle batches of points
+    code << "\n    // Note: Using batches of points\n";
+    code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
+    code << "    #pragma unroll\n";
+    code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
+    code << "      const CeedInt p = i % max_num_points;\n\n";
+
+    code << "      // -- Coordinates\n";
+    code << "      CeedScalar r_x[dim];\n";
+    code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+
+    code << "      // -- Input fields\n";
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      std::string var_suffix = "_in_" + std::to_string(i);
+
+      code << "      // ---- Input field " << i << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Basis action
+      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
+               << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
+               << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_WEIGHT:
+          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
+          code << "      r_s" << var_suffix << "[0] = 1.0;\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+    code << "\n      // -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      std::string var_suffix = "_out_" + std::to_string(i);
+
+      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-  }
 
-  // We treat quadrature points per slice in 3d to save registers
-  if (use_3d_slices) {
+  } else if (use_3d_slices) {
+    // We treat quadrature points per slice in 3d to save registers
     code << "\n    // Note: Using planes of 3D elements\n";
     code << "    #pragma unroll\n";
     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
@@ -532,10 +707,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
-          break;  // No action
-                  // LCOV_EXCL_START
+          break;
+          // LCOV_EXCL_START
         case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
@@ -551,7 +725,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
       switch (eval_mode) {
         case CEED_EVAL_NONE:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          break;  // No action
+          break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
@@ -562,7 +736,6 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
@@ -601,16 +774,64 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   // Apply QFunction
   code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices) {
     code << "1";
   } else {
     code << "Q_1d";
   }
   code << ", inputs, outputs);\n";
 
-  // Copy or apply transpose grad, if needed
-  if (use_3d_slices) {
-    code << "      // -- Output fields\n";
+  if (is_at_points) {
+    // Map back to coefficients
+    code << "\n      // -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE: {
+          CeedInt             comp_stride;
+          CeedElemRestriction elem_rstr;
+
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
+               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
+          break;
+        }
+        case CEED_EVAL_INTERP:
+          code << "      if (i >= points.num_per_elem[elem]) {\n";
+          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          code << "      }\n";
+          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
+               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << "      if (i >= points.num_per_elem[elem]) {\n";
+          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          code << "      }\n";
+          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
+               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else if (use_3d_slices) {
+    // Copy or apply transpose grad, if needed
+    code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
@@ -624,7 +845,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
           code << "      }\n";
-          break;  // No action
+          break;
         case CEED_EVAL_INTERP:
           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
@@ -638,7 +859,6 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
@@ -653,9 +873,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 // Build single operator kernel
 //------------------------------------------------------------------------------
 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
-  bool                    is_tensor = true, use_3d_slices = false;
+  bool                    is_tensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                    ceed;
-  CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1;
+  CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
   CeedQFunction_Cuda_gen *qf_data;
   CeedQFunction           qf;
@@ -678,15 +898,26 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Get operator data
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
                                                        qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
   if (dim == 0) dim = 1;
   data->dim = dim;
+  if (is_at_points) {
+    CeedElemRestriction_Cuda *rstr_data;
+    CeedElemRestriction       rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
+    data->points.indices = (CeedInt *)rstr_data->d_offsets;
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  if (is_at_points) use_3d_slices = false;
   if (Q_1d == 0) {
-    CeedInt Q;
-
-    CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-    Q_1d = Q;
+    if (is_at_points) Q_1d = max_num_points;
+    else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
   }
   data->Q_1d = Q_1d;
 
@@ -722,6 +953,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   // TODO: Add non-tensor, AtPoints
   code << "// Tensor basis source\n";
   code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  code << "// AtPoints basis source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
   code << "// CodeGen operator source\n";
   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
 
@@ -733,7 +966,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
   // Define CEED_Q_VLA
   code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices) {
     code << "#define CEED_Q_VLA 1\n\n";
   } else {
     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
@@ -757,13 +990,15 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "// d_[in,out]_i:   CeedVector device array\n";
   code << "// r_[in,out]_e_i: Element vector register\n";
   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
-  code << "// r_[in,out]_s_i: Quadrature space slice  vector register\n";
+  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficents register\n";
+  code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
   code << "// \n";
   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
   code << "// -----------------------------------------------------------------------------\n";
   code << "extern \"C\" __global__ void " << operator_name
-       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n";
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
+          "points) {\n";
 
   // Scratch buffers
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -780,6 +1015,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
   code << "  const CeedInt dim = " << dim << ";\n";
   code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  if (is_at_points) {
+    code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
+    code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+  }
 
   // Shared data
   code << "  extern __shared__ CeedScalar slice[];\n";
@@ -793,11 +1032,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -806,7 +1047,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
 
   // -- Compute minimum buffer space needed
-  CeedInt max_rstr_buffer_size = 0;
+  CeedInt max_rstr_buffer_size = 1;
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt             num_comp, elem_size;
@@ -884,15 +1125,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
-                                                                Q_1d, true, use_3d_slices));
+                                                                Q_1d, true, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
   }
 
   // -- Q function
-  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
-                                                            op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
+                                                            num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points,
+                                                            use_3d_slices));
 
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
@@ -900,11 +1143,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
     code << "    // ---- Output field " << i << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points,
+                                                          use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(
-        CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
+                                                                is_at_points, use_3d_slices));
   }
 
   // Close loop and function
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index d999dc8caa..570e8735c1 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -8,6 +8,8 @@
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <ceed/jit-source/cuda/cuda-types.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
 #include <stddef.h>
 
 #include "../cuda/ceed-cuda-common.h"
@@ -19,10 +21,14 @@
 // Destroy operator
 //------------------------------------------------------------------------------
 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
+  Ceed                   ceed;
   CeedOperator_Cuda_gen *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -92,6 +98,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool                    is_at_points;
   Ceed                    ceed;
   Ceed_Cuda              *cuda_data;
   CeedInt                 num_elem, num_input_fields, num_output_fields;
@@ -181,25 +188,52 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
     }
   }
 
+  // Point coordinates, if needed
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    // Coords
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+
+    // Points per elem
+    if (num_elem != data->points.num_elem) {
+      CeedInt            *points_per_elem;
+      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+      CeedElemRestriction rstr_points = NULL;
+
+      data->points.num_elem = num_elem;
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+      for (CeedInt e = 0; e < num_elem; e++) {
+        CeedInt num_points_elem;
+
+        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+        points_per_elem[e] = num_points_elem;
+      }
+      if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
+      CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+      CeedCallBackend(CeedFree(&points_per_elem));
+    }
+  }
+
   // Get context data
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
+  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
   const CeedInt dim       = data->dim;
   const CeedInt Q_1d      = data->Q_1d;
   const CeedInt P_1d      = data->max_P_1d;
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int           max_threads_per_block, min_grid_size;
+  int           max_threads_per_block, min_grid_size, grid;
 
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] =
-      {
-          thread_1d,
-          dim < 2 ? 1 : thread_1d,
-          -1,
-      },
-      grid;
+  int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};
 
   CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
                                      cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
@@ -236,6 +270,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
       if (is_active) vec = output_vec;
       // Check for multiple output modes
       CeedInt index = -1;
+
       for (CeedInt j = 0; j < i; j++) {
         if (vec == output_vecs[j]) {
           index = j;
@@ -249,6 +284,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
     }
   }
 
+  // Restore point coordinates, if needed
+  if (is_at_points) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+  }
+
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index 0ab817186d..213a769052 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -39,6 +39,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index bd0c76e671..c88e9fd18f 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -22,6 +22,7 @@ typedef struct {
   Fields_Cuda    B;
   Fields_Cuda    G;
   CeedScalar    *W;
+  Points_Cuda    points;
 } CeedOperator_Cuda_gen;
 
 typedef struct {
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index a55e6e5cc5..9922cb3a69 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -461,7 +461,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
 
-  // Setup output arays
+  // Setup output arrays
   code << "\n    // -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     std::string var_suffix = "_out_" + std::to_string(i);
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index 8a79ba5989..b5373320c5 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -17,6 +17,38 @@ inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__res
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// L-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = indices[p + elem * NUM_PTS];
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> L-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
+  if (p < points_in_elem) {
+    const CeedInt ind = indices[p + elem * NUM_PTS];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_u[ind + comp * COMP_STRIDE] += r_u[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 1D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h
index 3410286f78..f80d7193d9 100644
--- a/include/ceed/jit-source/cuda/cuda-types.h
+++ b/include/ceed/jit-source/cuda/cuda-types.h
@@ -23,6 +23,13 @@ typedef struct {
   CeedInt *outputs[CEED_CUDA_NUMBER_FIELDS];
 } FieldsInt_Cuda;
 
+typedef struct {
+  CeedInt           num_elem;
+  const CeedInt    *num_per_elem;
+  const CeedInt    *indices;
+  const CeedScalar *coords;
+} Points_Cuda;
+
 typedef struct {
   CeedInt     t_id_x;
   CeedInt     t_id_y;

From bcd92680c8b93d5b8861c3c87761f161cf9f50ef Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Dec 2024 17:29:30 -0700
Subject: [PATCH 243/571] pc - fix fallback creation for AtPoints operator

---
 interface/ceed-preconditioning.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 1879464450..700699e75b 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -136,6 +136,7 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
       CeedCall(CeedCompositeOperatorAddSub(op_fallback, op_sub_fallback));
     }
   } else {
+    bool               is_at_points = false;
     CeedInt            num_input_fields, num_output_fields;
     CeedQFunction      qf_fallback = NULL, dqf_fallback = NULL, dqfT_fallback = NULL;
     CeedOperatorField *input_fields, *output_fields;
@@ -143,7 +144,19 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
     CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->qf, &qf_fallback));
     CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqf, &dqf_fallback));
     CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqfT, &dqfT_fallback));
-    CeedCall(CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback));
+    CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+    if (is_at_points) {
+      CeedVector          points;
+      CeedElemRestriction rstr_points;
+
+      CeedCall(CeedOperatorCreateAtPoints(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback));
+      CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, &points));
+      CeedCall(CeedOperatorAtPointsSetPoints(op_fallback, rstr_points, points));
+      CeedCall(CeedVectorDestroy(&points));
+      CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+    } else {
+      CeedCall(CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback));
+    }
     CeedCall(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char         *field_name;

From 99f7f61f6ec335c6cfeb7f25a9074d61f2120c87 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Dec 2024 17:54:15 -0700
Subject: [PATCH 244/571] minor - fix view for AtPoints operators

---
 interface/ceed-operator.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index cc3493db87..09eb33ada6 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -136,12 +136,14 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField
   @ref Utility
 **/
 int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) {
+  bool                is_at_points;
   const char         *pre = sub ? "  " : "";
   CeedInt             num_elem, num_qpts, total_fields = 0, num_input_fields, num_output_fields;
   CeedQFunction       qf;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedOperatorField  *op_input_fields, *op_output_fields;
 
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorGetNumElements(op, &num_elem));
   CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
   CeedCall(CeedOperatorGetNumArgs(op, &total_fields));
@@ -150,7 +152,17 @@ int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) {
   CeedCall(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   CeedCall(CeedQFunctionDestroy(&qf));
 
-  fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", pre, num_elem, num_qpts);
+  if (is_at_points) {
+    CeedInt             max_points = 0;
+    CeedElemRestriction rstr_points;
+
+    CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_points));
+    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " max points each\n", pre, num_elem, max_points);
+    CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+  } else {
+    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", pre, num_elem, num_qpts);
+  }
   fprintf(stream, "%s  %" CeedInt_FMT " field%s\n", pre, total_fields, total_fields > 1 ? "s" : "");
   fprintf(stream, "%s  %" CeedInt_FMT " input field%s:\n", pre, num_input_fields, num_input_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1551,9 +1563,10 @@ int CeedOperatorSetName(CeedOperator op, const char *name) {
   @return Error code: 0 - success, otherwise - failure
 **/
 static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
-  bool has_name = op->name, is_composite;
+  bool has_name = op->name, is_composite, is_at_points;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   if (is_composite) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
@@ -1564,11 +1577,12 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
 
     for (CeedInt i = 0; i < num_suboperators; i++) {
       has_name = sub_operators[i]->name;
-      fprintf(stream, "  SubOperator %" CeedInt_FMT "%s%s%s\n", i, has_name ? " - " : "", has_name ? sub_operators[i]->name : "", is_full ? ":" : "");
+      fprintf(stream, "  SubOperator%s %" CeedInt_FMT "%s%s%s\n", is_at_points ? " AtPoints" : "", i, has_name ? " - " : "",
+              has_name ? sub_operators[i]->name : "", is_full ? ":" : "");
       if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], 1, stream));
     }
   } else {
-    fprintf(stream, "CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : "");
+    fprintf(stream, "CeedOperator%s%s%s\n", is_at_points ? " AtPoints" : "", has_name ? " - " : "", has_name ? op->name : "");
     if (is_full) CeedCall(CeedOperatorSingleView(op, 0, stream));
   }
   return CEED_ERROR_SUCCESS;

From 4eda27c22ac4807a862a62ad9a64874b04aeaf7a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 13 Dec 2024 14:21:20 -0700
Subject: [PATCH 245/571] gpu - minor fix to 1d AtPoints basis transpose

---
 .../cuda/cuda-shared-basis-tensor-at-points-templates.h       | 4 ++--
 .../hip/hip-shared-basis-tensor-at-points-templates.h         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index acf35a0dd7..2442d648ee 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -79,7 +79,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
     }
     // Pull from shared to register
     __syncthreads();
-    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
   }
 }
 
@@ -125,7 +125,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
     }
     // Pull from shared to register
     __syncthreads();
-    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
   }
 }
 
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 73d8cba91b..6c522ac5cd 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -79,7 +79,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee
     }
     // Pull from shared to register
     __syncthreads();
-    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
   }
 }
 
@@ -125,7 +125,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
     }
     // Pull from shared to register
     __syncthreads();
-    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
   }
 }
 

From 688b547317fb91b3911fc39af1ff501a468744ac Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 13 Dec 2024 16:33:06 -0700
Subject: [PATCH 246/571] minor - fix style in gen templates

---
 .../ceed/jit-source/cuda/cuda-gen-templates.h | 43 ++++++++++++-------
 .../ceed/jit-source/hip/hip-gen-templates.h   | 41 +++++++++++-------
 2 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index b5373320c5..17b9658cb5 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -179,13 +179,14 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
 inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -194,13 +195,14 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt n
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -238,13 +240,14 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
 inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -253,13 +256,14 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -274,15 +278,19 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
       __syncthreads();
       // X derivative
       r_V[comp + 0 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      }
       // Y derivative
       r_V[comp + 1 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      }
       // Z derivative
       r_V[comp + 2 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
+      }
       __syncthreads();
     }
   }
@@ -296,21 +304,24 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const Ce
                                                  CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // X derivative
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      // X derivative
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      }
       __syncthreads();
       // Y derivative
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      }
       __syncthreads();
       // Z derivative
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
+      }
     }
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 7059d8dafe..56521a3685 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -145,13 +145,14 @@ inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt el
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
 inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -159,13 +160,14 @@ inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt nu
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -203,13 +205,14 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedIn
 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
 inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -218,13 +221,14 @@ inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt n
 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
     for (CeedInt z = 0; z < P_1d; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -239,15 +243,19 @@ inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, c
       __syncthreads();
       // X derivative
       r_V[comp + 0 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      }
       // Y derivative
       r_V[comp + 1 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      }
       // Z derivative
       r_V[comp + 2 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
+      }
       __syncthreads();
     }
   }
@@ -264,18 +272,21 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const Cee
       // X derivative
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      }
       __syncthreads();
       // Y derivative
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      }
       __syncthreads();
       // Z derivative
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
+      for (CeedInt i = 0; i < Q_1d; i++) {
+        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
+      }
     }
   }
 }

From 3a2968d63a7f2ece086fcd3a62875aca8b9498aa Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 17 Dec 2024 12:36:46 -0700
Subject: [PATCH 247/571] hip - AtPoints for hip/gen

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |   2 +-
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 422 ++++++++++++++----
 backends/hip-gen/ceed-hip-gen-operator.c      |  51 ++-
 backends/hip-gen/ceed-hip-gen.c               |   1 +
 backends/hip-gen/ceed-hip-gen.h               |   1 +
 .../ceed/jit-source/hip/hip-gen-templates.h   |  32 ++
 include/ceed/jit-source/hip/hip-types.h       |   7 +
 7 files changed, 431 insertions(+), 85 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index cafcbd6d94..99da342f66 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1156,7 +1156,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "}\n";
   code << "// -----------------------------------------------------------------------------\n\n";
 
-  // View kernel for debugging
+  // Compile
   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
   CeedCallBackend(CeedOperatorSetSetupDone(op));
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 9922cb3a69..bd56a77350 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -25,25 +25,25 @@
 // Calculate the block size used for launching the operator kernel
 //------------------------------------------------------------------------------
 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
-  const CeedInt thread1d = CeedIntMax(Q_1d, P_1d);
+  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
   if (dim == 1) {
-    CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64;
+    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
 
     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
-    block_sizes[0]  = thread1d;
+    block_sizes[0]  = thread_1d;
     block_sizes[1]  = 1;
     block_sizes[2]  = elems_per_block;
   } else if (dim == 2) {
-    const CeedInt elems_per_block = thread1d < 4 ? 16 : 2;
+    const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
 
-    block_sizes[0] = thread1d;
-    block_sizes[1] = thread1d;
+    block_sizes[0] = thread_1d;
+    block_sizes[1] = thread_1d;
     block_sizes[2] = elems_per_block;
   } else if (dim == 3) {
-    const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
+    const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
 
-    block_sizes[0] = thread1d;
-    block_sizes[1] = thread1d;
+    block_sizes[0] = thread_1d;
+    block_sizes[1] = thread_1d;
     block_sizes[2] = elems_per_block;
   }
   return CEED_ERROR_SUCCESS;
@@ -82,6 +82,7 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedBasis basis;
@@ -104,6 +105,7 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
@@ -123,6 +125,7 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
         was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -137,6 +140,7 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
         was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
   }
@@ -147,7 +151,7 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                    CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+                                                    CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string           P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
   std::string           option_name = (is_input ? "inputs" : "outputs");
@@ -165,6 +169,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
@@ -184,16 +189,55 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
     case CEED_EVAL_NONE:
       break;
     case CEED_EVAL_INTERP:
-      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
-      else data->B.outputs[i] = basis_data->d_interp_1d;
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallHip(CeedBasisReturnCeed(basis),
+                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       break;
     case CEED_EVAL_GRAD:
-      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
-      else data->B.outputs[i] = basis_data->d_interp_1d;
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallHip(CeedBasisReturnCeed(basis),
+                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (is_at_points) break;  // No G mat for AtPoints
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
@@ -217,11 +261,11 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
       break;  // No action
       // LCOV_EXCL_START
     case CEED_EVAL_DIV:
-      break;  // TODO: Not implemented
     case CEED_EVAL_CURL:
       break;  // TODO: Not implemented
               // LCOV_EXCL_STOP
   }
+  CeedCallBackend(CeedBasisDestroy(&basis));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -230,7 +274,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                      CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+                                                      CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string              P_name     = "P_1d" + var_suffix;
   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
@@ -253,6 +297,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
   }
+  CeedCallBackend(CeedBasisDestroy(&basis));
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Restriction
@@ -263,11 +308,11 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
 
       // Restriction was already done for previous input
       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
-    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
-      if (eval_mode == CEED_EVAL_NONE) {
+    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
+      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
-      } else {
+      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
         // Otherwise we're using the scratch space
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
       }
@@ -300,10 +345,17 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
                << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
+        case CEED_RESTRICTION_POINTS: {
+          CeedInt comp_stride;
+
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          break;
+        }
         // LCOV_EXCL_START
         case CEED_RESTRICTION_ORIENTED:
         case CEED_RESTRICTION_CURL_ORIENTED:
-        case CEED_RESTRICTION_POINTS:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
       }
@@ -339,14 +391,17 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
              << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
+      case CEED_RESTRICTION_POINTS:
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        break;
       // LCOV_EXCL_START
       case CEED_RESTRICTION_ORIENTED:
       case CEED_RESTRICTION_CURL_ORIENTED:
-      case CEED_RESTRICTION_POINTS:
         break;  // TODO: Not implemented
                 // LCOV_EXCL_STOP
     }
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -355,7 +410,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
-                                                bool use_3d_slices) {
+                                                bool is_at_points, bool use_3d_slices) {
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
@@ -369,6 +424,7 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
@@ -380,17 +436,27 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
   if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!use_3d_slices) {
+        if (!use_3d_slices && !is_at_points) {
           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-             << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        if (is_at_points) {
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        }
         break;
       case CEED_EVAL_GRAD:
-        if (use_3d_slices) {
+        if (is_at_points) {
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
           code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
                << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
@@ -402,12 +468,16 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
         }
         break;
       case CEED_EVAL_WEIGHT: {
-        CeedBasis_Hip_shared *basis_data;
+        if (is_at_points) {
+          code << "    // Nothing to do AtPoints\n";
+        } else {
+          CeedBasis_Hip_shared *basis_data;
 
-        code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->W = basis_data->d_q_weight_1d;
-        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
+          CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+          data->W = basis_data->d_q_weight_1d;
+          code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+        }
         break;
       }
       // LCOV_EXCL_START
@@ -423,12 +493,20 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
         break;  // No action
       case CEED_EVAL_INTERP:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
-        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-             << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        if (is_at_points) {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        }
         break;
       case CEED_EVAL_GRAD:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
-        if (use_3d_slices) {
+        if (is_at_points) {
+          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
+               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
                << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
@@ -446,16 +524,17 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                 // LCOV_EXCL_STOP
     }
   }
+  CeedCallBackend(CeedBasisDestroy(&basis));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt num_input_fields,
-                                                    CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
+static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt max_num_points,
+                                                    CeedInt num_input_fields, CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
                                                     CeedInt num_output_fields, CeedOperatorField *op_output_fields,
-                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
+                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d, bool is_at_points,
                                                     bool use_3d_slices) {
   std::string         Q_name    = "Q_1d";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
@@ -468,24 +547,128 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 
     code << "    // ---- Output field " << i << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
-      code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        if (is_at_points) {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        }
+        break;
+      case CEED_EVAL_INTERP:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
+          code << "    }\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+        }
+        break;
+      case CEED_EVAL_GRAD:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "*dim];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
+          code << "    }\n";
+        } else if (use_3d_slices) {
+          // Accumulator for gradient slices
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
+          code << "      r_q" << var_suffix << "[i] = 0.0;\n";
+          code << "    }\n";
+        } else {
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+        }
+        break;
+      case CEED_EVAL_WEIGHT:
+        break;
+        // LCOV_EXCL_START
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
-    if (eval_mode == CEED_EVAL_GRAD) {
-      if (use_3d_slices) {
-        // Accumulator for gradient slices
-        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-        code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
-        code << "      r_q" << var_suffix << "[i] = 0.0;\n";
-        code << "    }\n";
-      } else {
-        code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+  }
+
+  if (is_at_points) {
+    // We need to handle batches of points
+    code << "\n    // Note: Using batches of points\n";
+    code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
+    code << "    #pragma unroll\n";
+    code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
+    code << "      const CeedInt p = i % max_num_points;\n\n";
+
+    code << "      // -- Coordinates\n";
+    code << "      CeedScalar r_x[dim];\n";
+    code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+
+    code << "      // -- Input fields\n";
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      std::string var_suffix = "_in_" + std::to_string(i);
+
+      code << "      // ---- Input field " << i << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Basis action
+      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
+               << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
+               << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_WEIGHT:
+          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
+          code << "      r_s" << var_suffix << "[0] = 1.0;\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-  }
+    code << "\n      // -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      std::string var_suffix = "_out_" + std::to_string(i);
 
-  // We treat quadrature points per slice in 3d to save registers
-  if (use_3d_slices) {
+      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+
+  } else if (use_3d_slices) {
+    // We treat quadrature points per slice in 3d to save registers
     code << "\n    // Note: Using planes of 3D elements\n";
     code << "    #pragma unroll\n";
     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
@@ -549,10 +732,9 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
-          break;  // No action
-                  // LCOV_EXCL_START
+          break;
+          // LCOV_EXCL_START
         case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
@@ -568,7 +750,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
       switch (eval_mode) {
         case CEED_EVAL_NONE:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          break;  // No action
+          break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
@@ -579,7 +761,6 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
@@ -618,16 +799,64 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   // Apply QFunction
   code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices) {
     code << "1";
   } else {
     code << "Q_1d";
   }
   code << ", inputs, outputs);\n";
 
-  // Copy or apply transpose grad, if needed
-  if (use_3d_slices) {
-    code << "      // -- Output fields\n";
+  if (is_at_points) {
+    // Map back to coefficients
+    code << "\n      // -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE: {
+          CeedInt             comp_stride;
+          CeedElemRestriction elem_rstr;
+
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
+               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
+          break;
+        }
+        case CEED_EVAL_INTERP:
+          code << "      if (i >= points.num_per_elem[elem]) {\n";
+          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          code << "      }\n";
+          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
+               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << "      if (i >= points.num_per_elem[elem]) {\n";
+          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          code << "      }\n";
+          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
+               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else if (use_3d_slices) {
+    // Copy or apply transpose grad, if needed
+    code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
@@ -641,7 +870,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
           code << "      }\n";
-          break;  // No action
+          break;
         case CEED_EVAL_INTERP:
           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
@@ -655,7 +884,6 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
                   // LCOV_EXCL_STOP
@@ -670,9 +898,9 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 // Build single operator kernel
 //------------------------------------------------------------------------------
 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
-  bool                   is_tensor = true, use_3d_slices = false;
+  bool                   is_tensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                   ceed;
-  CeedInt                Q_1d, num_input_fields, num_output_fields, dim = 1;
+  CeedInt                Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
   CeedQFunction_Hip_gen *qf_data;
   CeedQFunction          qf;
@@ -695,15 +923,26 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Get operator data
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
                                                       qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
   if (dim == 0) dim = 1;
   data->dim = dim;
+  if (is_at_points) {
+    CeedElemRestriction_Hip *rstr_data;
+    CeedElemRestriction      rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
+    data->points.indices = (CeedInt *)rstr_data->d_offsets;
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  if (is_at_points) use_3d_slices = false;
   if (Q_1d == 0) {
-    CeedInt Q;
-
-    CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-    Q_1d = Q;
+    if (is_at_points) Q_1d = max_num_points;
+    else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
   }
   data->Q_1d = Q_1d;
 
@@ -726,6 +965,8 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   // TODO: Add non-tensor, AtPoints
   code << "// Tensor basis source\n";
   code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  code << "// AtPoints basis source\n";
+  code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
   code << "// CodeGen operator source\n";
   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
 
@@ -737,7 +978,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
   // Define CEED_Q_VLA
   code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices) {
     code << "#define CEED_Q_VLA 1\n\n";
   } else {
     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
@@ -761,14 +1002,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   code << "// d_[in,out]_i:   CeedVector device array\n";
   code << "// r_[in,out]_e_i: Element vector register\n";
   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
-  code << "// r_[in,out]_s_i: Quadrature space slice  vector register\n";
+  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficents register\n";
+  code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
   code << "// \n";
   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
   code << "// -----------------------------------------------------------------------------\n";
   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
   code << "__global__ void " << operator_name
-       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n";
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
 
   // Scratch buffers
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -785,6 +1027,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
   code << "  const CeedInt dim = " << dim << ";\n";
   code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  if (is_at_points) {
+    code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
+    code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+  }
 
   // Shared data
   code << "  extern __shared__ CeedScalar slice[];\n";
@@ -798,11 +1044,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -811,7 +1059,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
 
   // -- Compute minimum buffer space needed
-  CeedInt max_rstr_buffer_size = 0;
+  CeedInt max_rstr_buffer_size = 1;
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt             num_comp, elem_size;
@@ -872,12 +1120,16 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
           input_field_order[curr_index] = j;
           curr_index++;
         }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
 
   // -- Input restriction and basis
-  code << "    // -- Input field restrictions and basis actions\n";
+  code << "\n    // -- Input field restrictions and basis actions\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt f = input_field_order[i];
 
@@ -885,15 +1137,17 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d,
-                                                               true, use_3d_slices));
+                                                               true, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
   }
 
   // -- Q function
-  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
-                                                           op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
+                                                           num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points,
+                                                           use_3d_slices));
 
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
@@ -901,11 +1155,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
     code << "    // ---- Output field " << i << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(
-        CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
+                                                               is_at_points, use_3d_slices));
   }
 
   // Close loop and function
@@ -916,6 +1171,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   CeedInt block_sizes[3] = {0, 0, 0};
   CeedInt num_elem;
 
+  // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
   CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 11ee4943b8..d3455ac137 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -9,6 +9,7 @@
 #include <ceed/backend.h>
 #include <ceed/jit-source/hip/hip-types.h>
 #include <stddef.h>
+#include <hip/hiprtc.h>
 
 #include "../hip/ceed-hip-common.h"
 #include "../hip/ceed-hip-compile.h"
@@ -19,10 +20,14 @@
 // Destroy operator
 //------------------------------------------------------------------------------
 static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
+  Ceed                  ceed;
   CeedOperator_Hip_gen *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -30,6 +35,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool                   is_at_points;
   Ceed                   ceed;
   CeedInt                num_elem, num_input_fields, num_output_fields;
   CeedEvalMode           eval_mode;
@@ -110,11 +116,44 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     }
   }
 
+  // Point coordinates, if needed
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    // Coords
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+
+    // Points per elem
+    if (num_elem != data->points.num_elem) {
+      CeedInt            *points_per_elem;
+      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+      CeedElemRestriction rstr_points = NULL;
+
+      data->points.num_elem = num_elem;
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+      for (CeedInt e = 0; e < num_elem; e++) {
+        CeedInt num_points_elem;
+
+        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+        points_per_elem[e] = num_points_elem;
+      }
+      if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
+      CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+      CeedCallBackend(CeedFree(&points_per_elem));
+    }
+  }
+
   // Get context data
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
+  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
   const CeedInt dim       = data->dim;
   const CeedInt Q_1d      = data->Q_1d;
   const CeedInt P_1d      = data->max_P_1d;
@@ -163,6 +202,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
       if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
       // Check for multiple output modes
       CeedInt index = -1;
+
       for (CeedInt j = 0; j < i; j++) {
         if (vec == output_vecs[j]) {
           index = j;
@@ -175,6 +215,15 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     }
   }
 
+  // Restore point coordinates, if needed
+  if (is_at_points) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+  }
+
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index e867b4eb9e..b7a8c76d0a 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -39,6 +39,7 @@ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index 139bab43bb..eb5dd0c893 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -22,6 +22,7 @@ typedef struct {
   Fields_Hip    B;
   Fields_Hip    G;
   CeedScalar   *W;
+  Points_Hip    points;
 } CeedOperator_Hip_gen;
 
 typedef struct {
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 56521a3685..358583a79e 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -17,6 +17,38 @@ inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__rest
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// L-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = indices[p + elem * NUM_PTS];
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> L-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
+  if (p < points_in_elem) {
+    const CeedInt ind = indices[p + elem * NUM_PTS];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_u[ind + comp * COMP_STRIDE] += r_u[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 1D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-types.h b/include/ceed/jit-source/hip/hip-types.h
index 418e6fb02c..01fe82e08d 100644
--- a/include/ceed/jit-source/hip/hip-types.h
+++ b/include/ceed/jit-source/hip/hip-types.h
@@ -23,6 +23,13 @@ typedef struct {
   CeedInt *outputs[CEED_HIP_NUMBER_FIELDS];
 } FieldsInt_Hip;
 
+typedef struct {
+  CeedInt           num_elem;
+  const CeedInt    *num_per_elem;
+  const CeedInt    *indices;
+  const CeedScalar *coords;
+} Points_Hip;
+
 typedef struct {
   CeedInt     t_id_x;
   CeedInt     t_id_y;

From 1caccaa9310efa488a472974b0bb3e05a8316d41 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 2 Jan 2025 09:16:46 -0700
Subject: [PATCH 248/571] ci - disable ROCm CI

---
 .gitlab-ci.yml | 140 +++++++++++++++++++++++++------------------------
 1 file changed, 72 insertions(+), 68 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e490cb533a..000b0e650a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -252,90 +252,94 @@ noether-cuda:
 # ----------------------------------------------------------------------------------------
 # ROCm backends
 # ----------------------------------------------------------------------------------------
-noether-rocm:
-  stage: test:gpu-and-float
-  tags:
-    - rocm
-  interruptible: true
-  before_script:
-# Environment
-    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc
-    - export NPROC_POOL=4
-    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
-    - echo "-------------- CC ------------------" && $CC --version
-    - echo "-------------- CXX -----------------" && $CXX --version
-    - echo "-------------- FC ------------------" && $FC --version
-    - echo "-------------- HIPCC ---------------" && $HIPCC --version
-    - echo "-------------- GCOV ----------------" && gcov --version
-# Libraries for backends
-# -- MAGMA from dev branch
-    - echo "-------------- MAGMA ---------------"
-    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
-  script:
-    - rm -f .SUCCESS
-# libCEED
-    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
-    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
-    - echo "-------------- libCEED -------------" && make info
-    - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
-    - make clean
-    - make -j$NPROC_CPU
-# -- libCEED only tests
-    - echo "-------------- core tests ----------"
-    - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
-    - export PETSC_DIR= PETSC_ARCH=
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit realsearch=%
-# Libraries for examples
-# -- PETSc with HIP (minimal)
-    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids"
-# Clang-tidy
-    - echo "-------------- clang-tidy ----------" && clang-tidy --version
-    - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
-# Report status
-    - touch .SUCCESS
-  after_script:
-    - |
-      if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples;
-      fi
-  artifacts:
-    paths:
-      - build/*.junit
-    reports:
-      junit: build/*.junit
-      performance: performance.json
+
+# ROCm tests currently disabled
+
+#noether-rocm:
+#  stage: test:gpu-and-float
+#  tags:
+#    - rocm
+#  interruptible: true
+#  before_script:
+## Environment
+#    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc
+#    - export NPROC_POOL=4
+#    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
+#    - echo "-------------- CC ------------------" && $CC --version
+#    - echo "-------------- CXX -----------------" && $CXX --version
+#    - echo "-------------- FC ------------------" && $FC --version
+#    - echo "-------------- HIPCC ---------------" && $HIPCC --version
+#    - echo "-------------- GCOV ----------------" && gcov --version
+## Libraries for backends
+## -- MAGMA from dev branch
+#    - echo "-------------- MAGMA ---------------"
+#    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
+#  script:
+#    - rm -f .SUCCESS
+## libCEED
+#    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
+#    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
+#    - echo "-------------- libCEED -------------" && make info
+#    - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+#    - make clean
+#    - make -j$NPROC_CPU
+## -- libCEED only tests
+#    - echo "-------------- core tests ----------"
+#    - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
+##    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+#    - export PETSC_DIR= PETSC_ARCH=
+#    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit realsearch=%
+## Libraries for examples
+## -- PETSc with HIP (minimal)
+#    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
+#    - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
+#    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids"
+## Clang-tidy
+#    - echo "-------------- clang-tidy ----------" && clang-tidy --version
+#    - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
+## Report status
+#    - touch .SUCCESS
+#  after_script:
+#    - |
+#      if [ -f .SUCCESS ]; then
+#        lcov --directory . --capture --output-file coverage.info;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples;
+#      fi
+#  artifacts:
+#    paths:
+#      - build/*.junit
+#    reports:
+#      junit: build/*.junit
+#      performance: performance.json
 
 
 # ----------------------------------------------------------------------------------------
-# CPU + ROCm backends with CeedScalar == float (32 bit)
+# CPU + CUDA backends with CeedScalar == float (32 bit)
 # ----------------------------------------------------------------------------------------
 noether-float:
   stage: test:gpu-and-float
   tags:
     - cpu
-    - rocm
+    - cuda
   interruptible: true
   before_script:
 # Environment
-    - export COVERAGE=1 CC=gcc CXX=g++ FC= HIPCC=hipcc
+    - export COVERAGE=1 CC=gcc CXX=g++ FC= NVCC=nvcc
     - export NPROC_POOL=8
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
     - echo "-------------- CC ------------------" && $CC --version
     - echo "-------------- CXX -----------------" && $CXX --version
-    - echo "-------------- HIPCC ---------------" && $HIPCC --version
+    - echo "-------------- NVCC ----------------" && $NVCC --version
     - echo "-------------- GCOV ----------------" && gcov --version
 # Libraries for backends
+# ROCm tests currently disabled
 # -- MAGMA from dev branch
-    - echo "-------------- MAGMA ---------------"
-    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
+#    - echo "-------------- MAGMA ---------------"
+#    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
 # -- LIBXSMM 7 April 2024
     - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
@@ -345,7 +349,7 @@ noether-float:
 # Change to single precision
     - sed -i 's/ceed-f64/ceed-f32/1' include/ceed/types.h
 # Build libCEED
-    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
+    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
@@ -359,7 +363,7 @@ noether-float:
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="float-cpu" junit realsearch=%
     - export NPROC_POOL=4
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="float-hip" junit realsearch=%
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="float-cuda" junit realsearch=%
 # Report status
     - echo "SUCCESS" > .job_status
   after_script:

From 87d77c19c5dfc2c216f0234817a91dd9ccdabce0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 3 Jan 2025 10:34:50 -0700
Subject: [PATCH 249/571] minor - style

---
 .gitlab-ci.yml | 112 ++++++++++++++++++++++++-------------------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 000b0e650a..a0ea8afcf8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -19,37 +19,37 @@ noether-asan:
     - cpu
   interruptible: true
   before_script:
-# Environment
-#    Note: COVERAGE=0 is needed when using ASAN
+    # Environment
+    #    Note: COVERAGE=0 is needed when using ASAN
     - export COVERAGE=0 CC=gcc CXX=g++ FC=gfortran
     - export NPROC_POOL=8
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
     - echo "-------------- CC ------------------" && $CC --version
     - echo "-------------- CXX -----------------" && $CXX --version
     - echo "-------------- FC ------------------" && $FC --version
-# ASAN
+    # ASAN
     - echo "-------------- ASAN ----------------"
     - export ASAN=1 AFLAGS="-fsanitize=address -fsanitize=leak"
     - echo $AFLAGS
   script:
     - rm -f .SUCCESS
-# libCEED
+    # libCEED
     - make configure OPT='-O -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
     - make clean
     - make -j$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
     - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="memcheck" junit realsearch=%
-# Clang-tidy
+    # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
-# Report status
+    # Report status
     - touch .SUCCESS
   artifacts:
     paths:
@@ -68,7 +68,7 @@ noether-cpu:
     - cpu
   interruptible: true
   before_script:
-# Environment
+    # Environment
     - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran
     - export NPROC_POOL=8
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
@@ -76,16 +76,16 @@ noether-cpu:
     - echo "-------------- CXX -----------------" && $CXX --version
     - echo "-------------- FC ------------------" && $FC --version
     - echo "-------------- GCOV ----------------" && gcov --version
-# Libraries for backends
-# -- LIBXSMM 7 April 2024
+    # Libraries for backends
+    # -- LIBXSMM 7 April 2024
     - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
-# -- OCCA v1.6.0
+    # -- OCCA v1.6.0
     - cd .. && export OCCA_VERSION=occa-1.6.0 && { [[ -d $OCCA_VERSION ]] || { git clone --depth 1 --branch v1.6.0 https://github.com/libocca/occa.git $OCCA_VERSION && cd $OCCA_VERSION && export ENABLE_OPENCL="OFF" ENABLE_DPCPP="OFF" ENABLE_HIP="OFF" ENABLE_CUDA="OFF" && ./configure-cmake.sh && cmake --build build --parallel $NPROC_CPU && cmake --install build && cd ..; }; } && export OCCA_DIR=$PWD/$OCCA_VERSION/install && cd libCEED
     - echo "-------------- OCCA ----------------" && git -C $OCCA_DIR describe --tags && LD_LIBRARY_PATH=$OCCA_DIR/lib $OCCA_DIR/bin/occa info
   script:
     - rm -f .SUCCESS
-# libCEED
+    # libCEED
     - make configure OPT='-O -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
@@ -93,33 +93,33 @@ noether-cpu:
     - make clean
     - OCCA_DIR= PEDANTIC=1 make -j$NPROC_CPU
     - make -j$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
     - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit realsearch=%
-# Libraries for examples
-# -- PETSc (minimal)
+    # Libraries for examples
+    # -- PETSc (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cpu-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids"
-# -- MFEM v4.7
+    # -- MFEM v4.7
     - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
     - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=mfem
-# -- Nek5000 v19.0
+    # -- Nek5000 v19.0
     - export COVERAGE=0
     - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED
     - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags
     - export NPROC_POOL=1
     - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=nek NEK5K_DIR=$NEK5K_DIR
-# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
+    # -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
     - OCCA_DIR= BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
     - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
     - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
-# Report status
+    # Report status
     - touch .SUCCESS
   after_script:
     - |
@@ -148,7 +148,7 @@ noether-sycl:
     - sycl
   interruptible: true
   before_script:
-# Environment
+    # Environment
     - . /opt/intel/oneapi/setvars.sh
     - export COVERAGE=1 CC=icx CXX=icpx
     - export NPROC_POOL=8
@@ -157,14 +157,14 @@ noether-sycl:
     - echo "-------------- CXX -----------------" && $CXX --version
   script:
     - rm -f .SUCCESS
-# libCEED
+    # libCEED
     - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-O -march=native -ffp-contract=fast'
     - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL
     - make clean
     - make -j$NPROC_CPU
-# Report status
+    # Report status
     - touch .SUCCESS
 
 
@@ -177,7 +177,7 @@ noether-cuda:
     - cuda
   interruptible: true
   before_script:
-# Environment
+    # Environment
     - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc
     - export NPROC_POOL=4
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
@@ -186,51 +186,51 @@ noether-cuda:
     - echo "-------------- FC ------------------" && $FC --version
     - echo "-------------- NVCC ----------------" && $NVCC --version
     - echo "-------------- GCOV ----------------" && gcov --version
-# ASAN
+    # ASAN
     - echo "-------------- ASAN ----------------"
     - export ASAN=1 AFLAGS="-fsanitize=address -fsanitize=leak" ASAN_OPTIONS=protect_shadow_gap=0
     - echo $AFLAGS
   script:
     - rm -f .SUCCESS
-# libCEED
+    # libCEED
     - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr
     - echo "-------------- libCEED -------------" && make info
     - BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
     - make clean
     - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit realsearch=%
-# Rebuild without ASAN
+    # Rebuild without ASAN
     - unset ASAN AFLAGS ASAN_OPTIONS
     - make clean
     - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
-# Libraries for examples
-# -- PETSc with CUDA (minimal)
+    # Libraries for examples
+    # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search="petsc fluids solids"
-# -- MFEM v4.7
+    # -- MFEM v4.7
     - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
     - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=mfem
-# -- Nek5000 v19.0
+    # -- Nek5000 v19.0
     - export COVERAGE=0
     - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED
     - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags
     - export NPROC_POOL=1
     - make -k -j$NPROC_GPU BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=nek NEK5K_DIR=$NEK5K_DIR
-# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
+    # -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
     - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
     - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
-# Clang-tidy
+    # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
-# Report status
+    # Report status
     - touch .SUCCESS
   after_script:
     - |
@@ -261,7 +261,7 @@ noether-cuda:
 #    - rocm
 #  interruptible: true
 #  before_script:
-## Environment
+#    # Environment
 #    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc
 #    - export NPROC_POOL=4
 #    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
@@ -270,34 +270,34 @@ noether-cuda:
 #    - echo "-------------- FC ------------------" && $FC --version
 #    - echo "-------------- HIPCC ---------------" && $HIPCC --version
 #    - echo "-------------- GCOV ----------------" && gcov --version
-## Libraries for backends
-## -- MAGMA from dev branch
+#    # Libraries for backends
+#    # -- MAGMA from dev branch
 #    - echo "-------------- MAGMA ---------------"
 #    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
 #  script:
 #    - rm -f .SUCCESS
-## libCEED
+#    # libCEED
 #    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
 #    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
 #    - echo "-------------- libCEED -------------" && make info
 #    - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
 #    - make clean
 #    - make -j$NPROC_CPU
-## -- libCEED only tests
+#    # -- libCEED only tests
 #    - echo "-------------- core tests ----------"
 #    - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-##    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+#    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
 #    - export PETSC_DIR= PETSC_ARCH=
 #    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit realsearch=%
-## Libraries for examples
-## -- PETSc with HIP (minimal)
+#    # Libraries for examples
+#    # -- PETSc with HIP (minimal)
 #    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
 #    - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
 #    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids"
-## Clang-tidy
+#    # Clang-tidy
 #    - echo "-------------- clang-tidy ----------" && clang-tidy --version
 #    - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
-## Report status
+#    # Report status
 #    - touch .SUCCESS
 #  after_script:
 #    - |
@@ -327,7 +327,7 @@ noether-float:
     - cuda
   interruptible: true
   before_script:
-# Environment
+    # Environment
     - export COVERAGE=1 CC=gcc CXX=g++ FC= NVCC=nvcc
     - export NPROC_POOL=8
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
@@ -335,20 +335,20 @@ noether-float:
     - echo "-------------- CXX -----------------" && $CXX --version
     - echo "-------------- NVCC ----------------" && $NVCC --version
     - echo "-------------- GCOV ----------------" && gcov --version
-# Libraries for backends
+    # Libraries for backends
 # ROCm tests currently disabled
 # -- MAGMA from dev branch
 #    - echo "-------------- MAGMA ---------------"
 #    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
-# -- LIBXSMM 7 April 2024
+    # -- LIBXSMM 7 April 2024
     - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
   script:
     - rm -f .SUCCESS
-# libCEED
-# Change to single precision
+    # libCEED
+    # Change to single precision
     - sed -i 's/ceed-f64/ceed-f32/1' include/ceed/types.h
-# Build libCEED
+    # Build libCEED
     - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
@@ -356,15 +356,15 @@ noether-float:
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
     - make clean
     - make -j$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
     - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="float-cpu" junit realsearch=%
     - export NPROC_POOL=4
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="float-cuda" junit realsearch=%
-# Report status
+    # Report status
     - echo "SUCCESS" > .job_status
   after_script:
     - |

From ff186337b341b423400898116d8fc0cf1bb9b9bf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 3 Jan 2025 10:35:35 -0700
Subject: [PATCH 250/571] ci - temporarily disable SYCL tests

---
 .gitlab-ci.yml | 51 ++++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a0ea8afcf8..27b8f62cc1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -142,30 +142,33 @@ noether-cpu:
 # ----------------------------------------------------------------------------------------
 # Check SYCL backends build
 # ----------------------------------------------------------------------------------------
-noether-sycl:
-  stage: test:gpu-and-float
-  tags:
-    - sycl
-  interruptible: true
-  before_script:
-    # Environment
-    - . /opt/intel/oneapi/setvars.sh
-    - export COVERAGE=1 CC=icx CXX=icpx
-    - export NPROC_POOL=8
-    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
-    - echo "-------------- CC ------------------" && $CC --version
-    - echo "-------------- CXX -----------------" && $CXX --version
-  script:
-    - rm -f .SUCCESS
-    # libCEED
-    - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-O -march=native -ffp-contract=fast'
-    - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ')
-    - echo "-------------- libCEED -------------" && make info
-    - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL
-    - make clean
-    - make -j$NPROC_CPU
-    # Report status
-    - touch .SUCCESS
+
+# SYCL tests currently disabled
+
+#noether-sycl:
+#  stage: test:gpu-and-float
+#  tags:
+#    - sycl
+#  interruptible: true
+#  before_script:
+#    # Environment
+#    - . /opt/intel/oneapi/setvars.sh
+#    - export COVERAGE=1 CC=icx CXX=icpx
+#    - export NPROC_POOL=8
+#    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
+#    - echo "-------------- CC ------------------" && $CC --version
+#    - echo "-------------- CXX -----------------" && $CXX --version
+#  script:
+#    - rm -f .SUCCESS
+#    # libCEED
+#    - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-O -march=native -ffp-contract=fast'
+#    - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ')
+#    - echo "-------------- libCEED -------------" && make info
+#    - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL
+#    - make clean
+#    - make -j$NPROC_CPU
+#    # Report status
+#    - touch .SUCCESS
 
 
 # ----------------------------------------------------------------------------------------

From 9ff05d55386b4e6413be60b7231511258906fd9f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 3 Jan 2025 11:59:41 -0700
Subject: [PATCH 251/571] cuda - add nontensor shared

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 176 +++++++++++++++++-
 backends/cuda-shared/ceed-cuda-shared.c       |   1 +
 backends/cuda-shared/ceed-cuda-shared.h       |   3 +
 .../cuda-shared-basis-nontensor-templates.h   |  98 ++++++++++
 .../cuda/cuda-shared-basis-nontensor.h        | 164 ++++++++++++++++
 5 files changed, 441 insertions(+), 1 deletion(-)
 create mode 100644 include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
 create mode 100644 include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 8a3def90b3..f01ec95f8c 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -26,7 +26,7 @@ int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_
 int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
 
 //------------------------------------------------------------------------------
-// Apply basis
+// Apply tensor basis
 //------------------------------------------------------------------------------
 static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
                                                 CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
@@ -424,6 +424,123 @@ static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Apply non-tensor basis
+//------------------------------------------------------------------------------
+static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                                   CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  Ceed                   ceed;
+  Ceed_Cuda             *ceed_Cuda;
+  CeedInt                dim;
+  const CeedScalar      *d_u;
+  CeedScalar            *d_v;
+  CeedBasis_Cuda_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Cuda));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Get read/write access to u, v
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+
+  // Apply basis operation
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      CeedInt P, Q;
+
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread = CeedIntMax(Q, P);
+
+      CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P, Q, &data->c_B));
+      void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
+
+      {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1,
+                                                      elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_GRAD: {
+      CeedInt P, Q;
+
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread = CeedIntMax(Q, P);
+
+      CeedCallBackend(CeedInit_CudaInterp(data->d_grad_1d, P, Q * dim, &data->c_G));
+      void *grad_args[] = {(void *)&num_elem, &data->c_G, &d_u, &d_v};
+
+      {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1,
+                                                      elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_WEIGHT: {
+      CeedInt Q;
+
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
+
+      {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / Q, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+
+        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, Q, elems_per_block, 1, weight_args));
+      }
+    } break;
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                               CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                                  CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy basis
 //------------------------------------------------------------------------------
@@ -513,3 +630,60 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
 }
 
 //------------------------------------------------------------------------------
+// Create non-tensor basis
+//------------------------------------------------------------------------------
+int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                  const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                   ceed;
+  CeedInt                num_comp, q_comp_interp, q_comp_grad;
+  const CeedInt          q_bytes = num_qpts * sizeof(CeedScalar);
+  CeedBasis_Cuda_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedCalloc(1, &data));
+
+  // Copy basis data to GPU
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
+  if (interp) {
+    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
+
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice));
+  }
+  if (grad) {
+    const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
+
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice));
+  }
+
+  // Compile basis kernels
+  const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n";
+
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
+                                   CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
+
+  CeedCallBackend(CeedBasisSetData(basis, data));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c
index baa374af36..fe3c2a7117 100644
--- a/backends/cuda-shared/ceed-cuda-shared.c
+++ b/backends/cuda-shared/ceed-cuda-shared.c
@@ -36,6 +36,7 @@ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index f42f2b1cff..db2d47809d 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -39,3 +39,6 @@ typedef struct {
 
 CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                                     const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
+
+CEED_INTERN int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                              const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
new file mode 100644
index 0000000000..1c03111dd4
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory non-tensor product basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 1D tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (data.t_id_x < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 1D transpose tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  if (data.t_id_x < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q>
+inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                       CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q>
+inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = 0.0;
+    ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int DIM, int P, int Q>
+inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int DIM, int P, int Q>
+inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp + dim * NUM_COMP], &c_G[dim * P * Q], &r_V[comp]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Quadrature weights
+//------------------------------------------------------------------------------
+template <int Q>
+inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+  *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
new file mode 100644
index 0000000000..804679ddf4
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory non-tensor basis
+#include <ceed/types.h>
+
+#include "cuda-shared-basis-nontensor-templates.h"
+#include "cuda-shared-basis-read-write-templates.h"
+
+//------------------------------------------------------------------------------
+// Interp kernel by dim
+//------------------------------------------------------------------------------
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                           CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                              CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad kernel by dim
+//------------------------------------------------------------------------------
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                         CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                            CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Weight kernels by dim
+//------------------------------------------------------------------------------
+extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_W[1];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
+    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
+  }
+}

From cb270d314a40d8c6c34bcefc3e58c4db0ebb0bd4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 3 Jan 2025 14:45:00 -0700
Subject: [PATCH 252/571] minor style

---
 tests/t319-basis.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index f63299867a..7417952dfd 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -155,24 +155,24 @@ int main(int argc, char **argv) {
     CeedBasis basis_from_nontensor, basis_to_nontensor;
     {
       CeedElemTopology  topo;
-      CeedInt           num_comp, num_nodes, nqpts;
+      CeedInt           num_comp, num_nodes, num_qpts;
       const CeedScalar *interp, *grad;
 
       CeedBasisGetTopology(basis_from, &topo);
       CeedBasisGetNumComponents(basis_from, &num_comp);
       CeedBasisGetNumNodes(basis_from, &num_nodes);
-      CeedBasisGetNumQuadraturePoints(basis_from, &nqpts);
+      CeedBasisGetNumQuadraturePoints(basis_from, &num_qpts);
       CeedBasisGetInterp(basis_from, &interp);
       CeedBasisGetGrad(basis_from, &grad);
-      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, nqpts, interp, grad, NULL, NULL, &basis_from_nontensor);
+      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, num_qpts, interp, grad, NULL, NULL, &basis_from_nontensor);
 
       CeedBasisGetTopology(basis_to, &topo);
       CeedBasisGetNumComponents(basis_to, &num_comp);
       CeedBasisGetNumNodes(basis_to, &num_nodes);
-      CeedBasisGetNumQuadraturePoints(basis_to, &nqpts);
+      CeedBasisGetNumQuadraturePoints(basis_to, &num_qpts);
       CeedBasisGetInterp(basis_to, &interp);
       CeedBasisGetGrad(basis_to, &grad);
-      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, nqpts, interp, grad, NULL, NULL, &basis_to_nontensor);
+      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, num_qpts, interp, grad, NULL, NULL, &basis_to_nontensor);
     }
 
     // Test projection on non-tensor bases

From aa4002ad1a9f7f3442c9f6afb79353f990ebef22 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 3 Jan 2025 16:35:42 -0700
Subject: [PATCH 253/571] gpu - use gen LoadMatrix in shared

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 34 +++----
 .../cuda-shared/kernels/cuda-shared-basis.cu  | 53 -----------
 backends/hip-shared/ceed-hip-shared-basis.c   |  1 +
 .../cuda/cuda-shared-basis-nontensor.h        | 48 ++++++++--
 .../cuda-shared-basis-read-write-templates.h  |  8 ++
 .../cuda/cuda-shared-basis-tensor-at-points.h | 44 ++++++---
 .../cuda/cuda-shared-basis-tensor.h           | 85 ++++++++++++-----
 .../hip-shared-basis-read-write-templates.h   |  8 +-
 .../hip/hip-shared-basis-tensor-at-points.h   | 56 +++++------
 .../jit-source/hip/hip-shared-basis-tensor.h  | 93 ++++++++++---------
 10 files changed, 240 insertions(+), 190 deletions(-)
 delete mode 100644 backends/cuda-shared/kernels/cuda-shared-basis.cu

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index f01ec95f8c..5991559cd3 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -18,13 +18,6 @@
 #include "../cuda/ceed-cuda-compile.h"
 #include "ceed-cuda-shared.h"
 
-//------------------------------------------------------------------------------
-// Device initalization
-//------------------------------------------------------------------------------
-int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B);
-int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
-int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
-
 //------------------------------------------------------------------------------
 // Apply tensor basis
 //------------------------------------------------------------------------------
@@ -58,8 +51,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
 
-      CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B));
-      void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
+      void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
 
       if (dim == 1) {
         // avoid >512 total threads
@@ -104,14 +96,14 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
 
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-      CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+      CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
+      CeedScalar *d_grad_1d = data->d_grad_1d;
 
       if (data->d_collo_grad_1d) {
-        CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
-      } else {
-        CeedCallBackend(CeedInit_CudaGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
+        d_grad_1d = data->d_collo_grad_1d;
       }
-      void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v};
+      void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
+
       if (dim == 1) {
         // avoid >512 total threads
         CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
@@ -328,8 +320,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
 
-      CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
-      void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
 
       if (dim == 1) {
         // avoid >512 total threads
@@ -364,7 +355,6 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
 
-      CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
       void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
 
       if (dim == 1) {
@@ -456,8 +446,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread = CeedIntMax(Q, P);
 
-      CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P, Q, &data->c_B));
-      void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
+      void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
 
       {
         // avoid >512 total threads
@@ -480,8 +469,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread = CeedIntMax(Q, P);
 
-      CeedCallBackend(CeedInit_CudaInterp(data->d_grad_1d, P, Q * dim, &data->c_G));
-      void *grad_args[] = {(void *)&num_elem, &data->c_G, &d_u, &d_v};
+      void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};
 
       {
         // avoid >512 total threads
@@ -642,6 +630,10 @@ int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt nu
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedCalloc(1, &data));
 
+  // Check max sizes
+  CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3");
+  CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large");
+
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis.cu
deleted file mode 100644
index f654f7ddda..0000000000
--- a/backends/cuda-shared/kernels/cuda-shared-basis.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <ceed.h>
-#include <cuda.h>
-
-const int               sizeMax = 16;
-__constant__ CeedScalar c_B[sizeMax * sizeMax];
-__constant__ CeedScalar c_G[sizeMax * sizeMax];
-
-//------------------------------------------------------------------------------
-// Interp device initialization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr) {
-  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
-// Grad device initialization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  cudaMemcpyToSymbol(c_G, d_G, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_G_ptr, c_G);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
-// Collocated grad device initialization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes_interp = P_1d * Q_1d * sizeof(CeedScalar);
-  const int bytes_grad   = Q_1d * Q_1d * sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes_interp, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  cudaMemcpyToSymbol(c_G, d_G, bytes_grad, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_G_ptr, c_G);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 90357211ad..a94ef0c868 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -170,6 +170,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         d_grad_1d = data->d_collo_grad_1d;
       }
       void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
+
       if (dim == 1) {
         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index 804679ddf4..7813174a34 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -28,9 +28,15 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
+    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -49,9 +55,15 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -70,9 +82,15 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -93,9 +111,15 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, c
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
+    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -114,9 +138,15 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -135,9 +165,15 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index 49e7eca873..066f95ed58 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -9,6 +9,14 @@
 /// Internal header for CUDA shared memory basis read/write templates
 #include <ceed/types.h>
 
+//------------------------------------------------------------------------------
+// Load matrices for basis actions
+//------------------------------------------------------------------------------
+template <int P, int Q>
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
+}
+
 //------------------------------------------------------------------------------
 // 1D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index cd9021611a..d0cc602be9 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -36,19 +36,24 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -87,6 +92,11 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -112,13 +122,13 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -144,19 +154,24 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -195,6 +210,11 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -221,13 +241,13 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 9b80043996..a70481fb55 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -28,19 +28,25 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -61,19 +67,25 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -94,19 +106,25 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -130,21 +148,29 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                     d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
-      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -165,21 +191,29 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -200,21 +234,29 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -236,6 +278,7 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
 
   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
 
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index a5313ec925..47b4eae92f 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -12,11 +12,9 @@
 //------------------------------------------------------------------------------
 // Helper function: load matrices for basis actions
 //------------------------------------------------------------------------------
-template <int SIZE>
-inline __device__ void loadMatrix(const CeedScalar *d_B, CeedScalar *B) {
-  CeedInt tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-
-  for (CeedInt i = tid; i < SIZE; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
+template <int P, int Q>
+inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 9f5e947a07..753d5e1af7 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -21,15 +21,10 @@
 // Interp
 //------------------------------------------------------------------------------
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                        const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -42,6 +37,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
@@ -77,15 +77,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                                 const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -98,6 +93,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -140,15 +140,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 // Grad
 //------------------------------------------------------------------------------
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                      const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -161,6 +156,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
@@ -196,15 +196,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                               const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -217,6 +212,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index d84f5555c8..fda6ee2cfe 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -16,14 +16,9 @@
 // Interp kernel by dim
 //------------------------------------------------------------------------------
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -34,6 +29,12 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -54,14 +55,9 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -72,6 +68,12 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -92,14 +94,9 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -110,6 +107,12 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -132,18 +135,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
-              CeedScalar *__restrict__ d_V) {
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
+                                                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d and grad_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -154,6 +149,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -176,17 +179,10 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
+    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                        CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d and grad_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -197,6 +193,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -219,17 +223,10 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                           CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d and grad_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -240,6 +237,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);

From 6c13bbcb075a8bd926202fb66e317904cf28c80e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 7 Jan 2025 14:06:13 -0700
Subject: [PATCH 254/571] hip - add nontensor shared

---
 backends/hip-shared/ceed-hip-shared-basis.c   | 176 +++++++++++++++
 backends/hip-shared/ceed-hip-shared.c         |   1 +
 backends/hip-shared/ceed-hip-shared.h         |   3 +
 .../cuda-shared-basis-nontensor-templates.h   |   2 +-
 .../cuda/cuda-shared-basis-nontensor.h        |   6 +-
 .../hip-shared-basis-nontensor-templates.h    |  98 +++++++++
 .../hip/hip-shared-basis-nontensor.h          | 205 ++++++++++++++++++
 .../jit-source/hip/hip-shared-basis-tensor.h  |   4 +-
 8 files changed, 489 insertions(+), 6 deletions(-)
 create mode 100644 include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
 create mode 100644 include/ceed/jit-source/hip/hip-shared-basis-nontensor.h

diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index a94ef0c868..bfca624c44 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -479,6 +479,121 @@ static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt n
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Apply basis
+//------------------------------------------------------------------------------
+static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                                  CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  Ceed                  ceed;
+  Ceed_Hip             *ceed_Hip;
+  CeedInt               dim, num_comp;
+  const CeedScalar     *d_u;
+  CeedScalar           *d_v;
+  CeedBasis_Hip_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+
+  // Get read/write access to u, v
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+
+  // Apply basis operation
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      CeedInt P, Q;
+
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread        = CeedIntMax(Q, P);
+      void   *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
+
+      {
+        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1,
+                                                     elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_GRAD: {
+      CeedInt P, Q;
+
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread      = CeedIntMax(Q, P);
+      void   *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
+
+      {
+        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, elems_per_block,
+                                                     shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_WEIGHT: {
+      CeedInt Q;
+      CeedInt block_size = data->block_sizes[2];
+
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q));
+      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
+
+      {
+        const CeedInt opt_elems       = block_size / Q;
+        const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+
+        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q, elems_per_block, 1, weight_args));
+      }
+    } break;
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedBasisApplyNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                       CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedBasisApplyAddNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                          CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy basis
 //------------------------------------------------------------------------------
@@ -572,3 +687,64 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
 }
 
 //------------------------------------------------------------------------------
+// Create non-tensor basis
+//------------------------------------------------------------------------------
+int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                  ceed;
+  CeedInt               num_comp, q_comp_interp, q_comp_grad;
+  const CeedInt         q_bytes = num_qpts * sizeof(CeedScalar);
+  CeedBasis_Hip_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedCalloc(1, &data));
+
+  // Check max sizes
+  CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3");
+  CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large");
+
+  // Copy basis data to GPU
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
+  if (interp) {
+    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
+
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp, interp_bytes, hipMemcpyHostToDevice));
+  }
+  if (grad) {
+    const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
+
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, grad_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad, grad_bytes, hipMemcpyHostToDevice));
+  }
+
+  // Compile basis kernels
+  const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n";
+
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
+                                  CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
+
+  CeedCallBackend(CeedBasisSetData(basis, data));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c
index 01d173e42e..72d1da5f0b 100644
--- a/backends/hip-shared/ceed-hip-shared.c
+++ b/backends/hip-shared/ceed-hip-shared.c
@@ -36,6 +36,7 @@ static int CeedInit_Hip_shared(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index 962c088bc0..236e6f63a0 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -38,3 +38,6 @@ typedef struct {
 
 CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                                    const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
+
+CEED_INTERN int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                             const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
index 1c03111dd4..c142b05c3d 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -6,7 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 /// @file
-/// Internal header for CUDA shared memory non-tensor product basis templates
+/// Internal header for CUDA shared memory non-tensor basis templates
 #include <ceed/types.h>
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index 7813174a34..ad10f7dc9b 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -13,7 +13,7 @@
 #include "cuda-shared-basis-read-write-templates.h"
 
 //------------------------------------------------------------------------------
-// Interp kernel by dim
+// Interp kernels
 //------------------------------------------------------------------------------
 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -96,7 +96,7 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
 }
 
 //------------------------------------------------------------------------------
-// Grad kernel by dim
+// Grad kernels
 //------------------------------------------------------------------------------
 extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -179,7 +179,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
 }
 
 //------------------------------------------------------------------------------
-// Weight kernels by dim
+// Weight kernel
 //------------------------------------------------------------------------------
 extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
   extern __shared__ CeedScalar slice[];
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
new file mode 100644
index 0000000000..94e665b347
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory non-tensor basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 1D tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void Contract1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (data.t_id_x < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 1D transpose tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  if (data.t_id_x < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q>
+inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q>
+inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                         CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = 0.0;
+    ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q>
+inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q>
+inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                       CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp + dim * NUM_COMP], &c_G[dim * P * Q], &r_V[comp]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Quadrature weights
+//------------------------------------------------------------------------------
+template <int Q>
+inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  *w = (data.t_id_x < Q) ? q_weight_1d[data.t_id_x] : 0.0;
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
new file mode 100644
index 0000000000..4347815ea1
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -0,0 +1,205 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory non-tensor basis
+#include <ceed/types.h>
+
+#include "hip-shared-basis-read-write-templates.h"
+#include "hip-shared-basis-nontensor-templates.h"
+
+//------------------------------------------------------------------------------
+// Interp kernels
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad kernels
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G,
+                                                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                       CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                          CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = &slice[data.t_id_z * T_1D];
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Weight kernel
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
+    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D;
+
+  CeedScalar r_W[1];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
+    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index fda6ee2cfe..06c6d370b9 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -102,7 +102,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)];
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -232,7 +232,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = &slice[data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1)];
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

From 1f6c24fecef91b56ee93b512ed53da4b2ab340d3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 7 Jan 2025 14:13:01 -0700
Subject: [PATCH 255/571] test - shrink sizes in t319 for non-tensor

---
 backends/hip-shared/ceed-hip-shared-basis.c              | 4 ----
 .../hip/hip-shared-basis-nontensor-templates.h           | 3 +--
 include/ceed/jit-source/hip/hip-shared-basis-nontensor.h | 9 ++++-----
 tests/t319-basis.c                                       | 2 +-
 4 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index bfca624c44..63b4c47a99 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -699,10 +699,6 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedCalloc(1, &data));
 
-  // Check max sizes
-  CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3");
-  CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large");
-
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
index 94e665b347..68e2767090 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -66,8 +66,7 @@ inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar
 // Derivatives at quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P, int Q>
-inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                              CeedScalar *__restrict__ r_V) {
+inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
index 4347815ea1..f9ce1398d8 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -9,8 +9,8 @@
 /// Internal header for HIP shared memory non-tensor basis
 #include <ceed/types.h>
 
-#include "hip-shared-basis-read-write-templates.h"
 #include "hip-shared-basis-nontensor-templates.h"
+#include "hip-shared-basis-read-write-templates.h"
 
 //------------------------------------------------------------------------------
 // Interp kernels
@@ -99,8 +99,8 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 //------------------------------------------------------------------------------
 // Grad kernels
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G,
-                                                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
   SharedData_Hip data;
@@ -127,8 +127,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
-                       CeedScalar *__restrict__ d_V) {
+    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
   SharedData_Hip data;
diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index 7417952dfd..e34e296fca 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -116,7 +116,7 @@ int main(int argc, char **argv) {
   for (CeedInt dim = 1; dim <= 3; dim++) {
     CeedVector x_corners, x_from, x_to, u_from, u_to, du_to;
     CeedBasis  basis_x, basis_from, basis_to, basis_project;
-    CeedInt    p_from = 5, p_to = 6, q = 7, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim);
+    CeedInt    p_from = 3, p_to = 4, q = 4, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim);
 
     CeedVectorCreate(ceed, x_dim * dim, &x_corners);
     {

From 527d8beaad4548b3d383b6fd27e9f28f39e3ef2e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 9 Jan 2025 10:04:40 -0700
Subject: [PATCH 256/571] ci - add rocm build only test

---
 .gitlab-ci.yml | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 27b8f62cc1..c216de800a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -256,7 +256,7 @@ noether-cuda:
 # ROCm backends
 # ----------------------------------------------------------------------------------------
 
-# ROCm tests currently disabled
+# ROCm test execution currently disabled
 
 #noether-rocm:
 #  stage: test:gpu-and-float
@@ -320,6 +320,42 @@ noether-cuda:
 #      performance: performance.json
 
 
+noether-rocm:
+  stage: test:gpu-and-float
+  tags:
+    - rocm
+  interruptible: true
+  before_script:
+    # Environment
+    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc
+    - export NPROC_POOL=4
+    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
+    - echo "-------------- CC ------------------" && $CC --version
+    - echo "-------------- CXX -----------------" && $CXX --version
+    - echo "-------------- FC ------------------" && $FC --version
+    - echo "-------------- HIPCC ---------------" && $HIPCC --version
+    - echo "-------------- GCOV ----------------" && gcov --version
+    # Libraries for backends
+    # -- MAGMA from dev branch
+    - echo "-------------- MAGMA ---------------"
+    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
+  script:
+    - rm -f .SUCCESS
+    # libCEED
+    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
+    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
+    - echo "-------------- libCEED -------------" && make info
+    - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+    - make clean
+    - make -j$NPROC_CPU
+    # Clang-tidy
+    - echo "-------------- clang-tidy ----------" && clang-tidy --version
+    - make clean
+    - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
+    # Report status
+    - touch .SUCCESS
+
+
 # ----------------------------------------------------------------------------------------
 # CPU + CUDA backends with CeedScalar == float (32 bit)
 # ----------------------------------------------------------------------------------------

From a24d84eaf50532bd6ddb3309c91171c35669c827 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 9 Jan 2025 15:56:21 -0700
Subject: [PATCH 257/571] gpu - fix AtPoints transpose shift

---
 ...-shared-basis-tensor-at-points-templates.h | 20 +++++++++----------
 ...-shared-basis-tensor-at-points-templates.h | 20 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 2442d648ee..32437bf4c4 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
     // Contract x direction
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
-        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
       }
     }
     // Pull from shared to register
@@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
     // Contract x direction
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
-        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
       }
     }
     // Pull from shared to register
@@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + p) % Q_1D;
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + p) % Q_1D;
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = ((j + p) % Q_1D);
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
         if (p < NUM_POINTS) {
           for (CeedInt i = 0; i < Q_1D; i++) {
             // Note: shifting to avoid atomic adds
-            const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+            const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
             for (CeedInt j = 0; j < Q_1D; j++) {
-              const CeedInt jj = ((j + p) % Q_1D);
+              const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
             }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 6c522ac5cd..923de63395 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee
     // Contract x direction
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
-        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
       }
     }
     // Pull from shared to register
@@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
     // Contract x direction
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
-        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
       }
     }
     // Pull from shared to register
@@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + p) % Q_1D;
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + p) % Q_1D;
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = ((j + p) % Q_1D);
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
         if (p < NUM_POINTS) {
           for (CeedInt i = 0; i < Q_1D; i++) {
             // Note: shifting to avoid atomic adds
-            const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+            const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
             for (CeedInt j = 0; j < Q_1D; j++) {
-              const CeedInt jj = ((j + p) % Q_1D);
+              const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
             }

From 2d217acfcdb3fc2c258895deb77c68448814c687 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 13 Jan 2025 11:05:25 -0700
Subject: [PATCH 258/571] hip - fix missing template, compile values, fn names

---
 backends/hip-shared/ceed-hip-shared-basis.c   | 10 ++++---
 .../hip-shared-basis-nontensor-templates.h    | 21 ++++++++-------
 .../hip/hip-shared-basis-nontensor.h          | 27 +++++++++----------
 3 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 63b4c47a99..144af79c2f 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -533,7 +533,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread      = CeedIntMax(Q, P);
-      void   *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
+      void   *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};
 
       {
         CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
@@ -554,7 +554,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
       CeedInt block_size = data->block_sizes[2];
 
       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
       {
@@ -723,8 +723,10 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
   const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n";
 
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
-                                  CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
+  CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
+  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
+                                  CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
+                                  data->block_sizes[0]));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
index 68e2767090..d394179dfe 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -44,7 +44,8 @@ inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScala
 // Interpolate to quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P, int Q>
-inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
   }
@@ -54,8 +55,8 @@ inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restri
 // Interpolate transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P, int Q>
-inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                         CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     r_V[comp] = 0.0;
     ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
@@ -65,8 +66,8 @@ inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar
 //------------------------------------------------------------------------------
 // Derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q>
+inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
@@ -77,9 +78,9 @@ inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict
 //------------------------------------------------------------------------------
 // Derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                       CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q>
+inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                              CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -92,6 +93,6 @@ inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *_
 // Quadrature weights
 //------------------------------------------------------------------------------
 template <int Q>
-inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  *w = (data.t_id_x < Q) ? q_weight_1d[data.t_id_x] : 0.0;
+inline __device__ void WeightNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+  *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
index f9ce1398d8..c892a9c939 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -99,7 +99,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 //------------------------------------------------------------------------------
 // Grad kernels
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
@@ -114,8 +114,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
   // load grad into shared memory
-  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
   __syncthreads();
 
   // Apply basis element by element
@@ -126,7 +126,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   }
 }
 
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
@@ -141,8 +141,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_V[BASIS_NUM_COMP];
 
   // load grad into shared memory
-  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
   __syncthreads();
 
   // Apply basis element by element
@@ -153,9 +153,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   }
 }
 
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
-                          CeedScalar *__restrict__ d_V) {
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
   SharedData_Hip data;
@@ -163,14 +162,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = &slice[data.t_id_z * T_1D];
+  data.slice  = slice + data.t_id_z * T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
 
   // load grad into shared memory
-  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-  LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
   __syncthreads();
 
   // Apply basis element by element
@@ -184,8 +183,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
 //------------------------------------------------------------------------------
 // Weight kernel
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
-    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
   extern __shared__ CeedScalar slice[];
 
   SharedData_Hip data;

From fda2654674c58d65f4c9c8e4ad604f8368d0111a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 14 Jan 2025 14:48:14 -0700
Subject: [PATCH 259/571] gpu - fallback if nontensor shared uses too much mem

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 18 +++++++--
 backends/hip-shared/ceed-hip-shared-basis.c   | 13 +++++++
 include/ceed/backend.h                        |  3 ++
 interface/ceed-basis.c                        | 38 ++++++++++++++++++-
 tests/t319-basis.c                            |  2 +-
 5 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 5991559cd3..137e9f718c 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -628,11 +628,21 @@ int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt nu
   CeedBasis_Cuda_shared *data;
 
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-  CeedCallBackend(CeedCalloc(1, &data));
 
-  // Check max sizes
-  CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3");
-  CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large");
+  // Check shared memory size
+  {
+    Ceed_Cuda *cuda_data;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
+        cuda_data->device_prop.sharedMemPerBlock) {
+      CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
+      CeedCallBackend(CeedDestroy(&ceed));
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+
+  CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 144af79c2f..d65c065ec2 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -697,6 +697,19 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
   CeedBasis_Hip_shared *data;
 
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
+  // Check shared memory size
+  {
+    Ceed_Hip *hip_data;
+
+    CeedCallBackend(CeedGetData(ceed, &hip_data));
+    if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
+        hip_data->device_prop.sharedMemPerBlock) {
+      CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy basis data to GPU
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 3884501f4b..7f686660ed 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -338,6 +338,9 @@ CEED_EXTERN int CeedBasisGetFESpace(CeedBasis basis, CeedFESpace *fe_space);
 CEED_EXTERN int CeedBasisGetTopologyDimension(CeedElemTopology topo, CeedInt *dim);
 CEED_EXTERN int CeedBasisGetTensorContract(CeedBasis basis, CeedTensorContract *contract);
 CEED_EXTERN int CeedBasisSetTensorContract(CeedBasis basis, CeedTensorContract contract);
+CEED_EXTERN int CeedBasisCreateH1Fallback(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts,
+                                          const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights,
+                                          CeedBasis basis);
 
 CEED_EXTERN int  CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract);
 CEED_EXTERN int  CeedTensorContractApply(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *__restrict__ t,
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 4a4f5fb180..3d5d51107f 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -600,6 +600,42 @@ static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt
 /// @addtogroup CeedBasisBackend
 /// @{
 
+/**
+  @brief Fallback to a reference implementation for a non tensor-product basis for \f$H^1\f$ discretizations.
+    This function may only be called inside of a backend `BasisCreateH1` function.
+    This is used by a backend when the specific parameters for a `CeedBasis` exceed the backend's support, such as
+    when a `interp` and `grad` matrices require too many bytes to fit into shared memory on a GPU.
+
+  @param[in]  ceed      `Ceed` object used to create the `CeedBasis`
+  @param[in]  topo      Topology of element, e.g. hypercube, simplex, etc
+  @param[in]  num_comp  Number of field components (1 for scalar fields)
+  @param[in]  num_nodes Total number of nodes
+  @param[in]  num_qpts  Total number of quadrature points
+  @param[in]  interp    Row-major (`num_qpts * num_nodes`) matrix expressing the values of nodal basis functions at quadrature points
+  @param[in]  grad      Row-major (`dim * num_qpts * num_nodes`) matrix expressing derivatives of nodal basis functions at quadrature points
+  @param[in]  q_ref     Array of length `num_qpts * dim` holding the locations of quadrature points on the reference element
+  @param[in]  q_weight  Array of length `num_qpts` holding the quadrature weights on the reference element
+  @param[out] basis     Newly created `CeedBasis`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisCreateH1Fallback(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                              const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  CeedInt P = num_nodes, Q = num_qpts, dim = 0;
+  Ceed    delegate;
+
+  CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
+  CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1");
+
+  CeedCall(CeedReferenceCopy(delegate, &(basis)->ceed));
+  CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
+  CeedCall(delegate->BasisCreateH1(topo, dim, P, Q, interp, grad, q_ref, q_weight, basis));
+  CeedCall(CeedDestroy(&delegate));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Return collocated gradient matrix
 
@@ -1493,7 +1529,7 @@ int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, Ce
   @param[in]  num_qpts  Total number of quadrature points
   @param[in]  interp    Row-major (`num_qpts * num_nodes`) matrix expressing the values of nodal basis functions at quadrature points
   @param[in]  grad      Row-major (`dim * num_qpts * num_nodes`) matrix expressing derivatives of nodal basis functions at quadrature points
-  @param[in]  q_ref     Array of length `num_qpts` * dim holding the locations of quadrature points on the reference element
+  @param[in]  q_ref     Array of length `num_qpts * dim` holding the locations of quadrature points on the reference element
   @param[in]  q_weight  Array of length `num_qpts` holding the quadrature weights on the reference element
   @param[out] basis     Address of the variable where the newly created `CeedBasis` will be stored
 
diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index e34e296fca..c314cb2e82 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -116,7 +116,7 @@ int main(int argc, char **argv) {
   for (CeedInt dim = 1; dim <= 3; dim++) {
     CeedVector x_corners, x_from, x_to, u_from, u_to, du_to;
     CeedBasis  basis_x, basis_from, basis_to, basis_project;
-    CeedInt    p_from = 3, p_to = 4, q = 4, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim);
+    CeedInt    p_from = 4, p_to = 5, q = 6, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim);
 
     CeedVectorCreate(ceed, x_dim * dim, &x_corners);
     {

From 97011eab160f424d1728899096d209d02ae5cfb2 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Tue, 14 Jan 2025 14:09:06 -0800
Subject: [PATCH 260/571] Fix issue in block sizing for GPU shared basis

---
 backends/cuda-shared/ceed-cuda-shared-basis.c |  9 ++++++---
 backends/hip-shared/ceed-hip-shared-basis.c   | 15 ++++++++-------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 5991559cd3..599ece636d 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -486,18 +486,21 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
       }
     } break;
     case CEED_EVAL_WEIGHT: {
-      CeedInt Q;
+      CeedInt P, Q;
 
       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread = CeedIntMax(Q, P);
+
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
       {
         // avoid >512 total threads
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / Q, 1));
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
-        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, Q, elems_per_block, 1, weight_args));
+        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args));
       }
     } break;
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 144af79c2f..1c18099a82 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -550,19 +550,20 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
       }
     } break;
     case CEED_EVAL_WEIGHT: {
-      CeedInt Q;
-      CeedInt block_size = data->block_sizes[2];
+      CeedInt P, Q;
 
       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
-      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
+      CeedInt thread        = CeedIntMax(Q, P);
+      void   *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
       {
-        const CeedInt opt_elems       = block_size / Q;
-        const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
-        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q, elems_per_block, 1, weight_args));
+        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args));
       }
     } break;
     case CEED_EVAL_NONE: /* handled separately below */

From 4cbc44e08f838c303fd0baecdcbbb18a0c602ef6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 15 Jan 2025 12:41:45 -0700
Subject: [PATCH 261/571] minor - check for interp, grad in shared backends

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 6 +++++-
 backends/hip-shared/ceed-hip-shared-basis.c   | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index d0aaa76ba5..f3a73b54ba 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -47,6 +47,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
     case CEED_EVAL_INTERP: {
       CeedInt P_1d, Q_1d;
 
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
@@ -94,6 +95,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
     case CEED_EVAL_GRAD: {
       CeedInt P_1d, Q_1d;
 
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
@@ -442,6 +444,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
     case CEED_EVAL_INTERP: {
       CeedInt P, Q;
 
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread = CeedIntMax(Q, P);
@@ -465,6 +468,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
     case CEED_EVAL_GRAD: {
       CeedInt P, Q;
 
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread = CeedIntMax(Q, P);
@@ -488,7 +492,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
     case CEED_EVAL_WEIGHT: {
       CeedInt P, Q;
 
-      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread = CeedIntMax(Q, P);
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index e39809e030..ebe9048d3f 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -115,6 +115,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
       CeedInt P_1d, Q_1d;
       CeedInt block_size = data->block_sizes[0];
 
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
@@ -161,6 +162,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
       CeedInt P_1d, Q_1d;
       CeedInt block_size = data->block_sizes[1];
 
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
@@ -508,6 +510,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
     case CEED_EVAL_INTERP: {
       CeedInt P, Q;
 
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread        = CeedIntMax(Q, P);
@@ -530,6 +533,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
     case CEED_EVAL_GRAD: {
       CeedInt P, Q;
 
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread      = CeedIntMax(Q, P);
@@ -552,7 +556,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
     case CEED_EVAL_WEIGHT: {
       CeedInt P, Q;
 
-      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
       CeedInt thread        = CeedIntMax(Q, P);

From cc3bdf8cd7a97c52371610b7fbb458a86c2b0cc9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 16 Jan 2025 11:19:58 -0700
Subject: [PATCH 262/571] vec - update SetArray to keep old arrays for
 CEED_COPY_VALUES

---
 backends/cuda/ceed-cuda-common.c         | 13 ++++--
 backends/hip/ceed-hip-common.c           | 13 ++++--
 backends/memcheck/ceed-memcheck-vector.c | 13 +++---
 interface/ceed.c                         | 13 ++++--
 tests/t128-vector.c                      | 51 ++++++++++++++++++++++++
 5 files changed, 85 insertions(+), 18 deletions(-)
 create mode 100644 tests/t128-vector.c

diff --git a/backends/cuda/ceed-cuda-common.c b/backends/cuda/ceed-cuda-common.c
index cae17d11d5..f27b453e4d 100644
--- a/backends/cuda/ceed-cuda-common.c
+++ b/backends/cuda/ceed-cuda-common.c
@@ -53,10 +53,15 @@ static inline int CeedSetDeviceGenericArray_Cuda(Ceed ceed, const void *source_a
                                                  void *target_array_owned, void *target_array_borrowed, void *target_array) {
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      if (!*(void **)target_array_owned) CeedCallCuda(ceed, cudaMalloc(target_array_owned, size_unit * num_values));
-      if (source_array) CeedCallCuda(ceed, cudaMemcpy(*(void **)target_array_owned, source_array, size_unit * num_values, cudaMemcpyDeviceToDevice));
-      *(void **)target_array_borrowed = NULL;
-      *(void **)target_array          = *(void **)target_array_owned;
+      if (!*(void **)target_array) {
+        if (*(void **)target_array_borrowed) {
+          *(void **)target_array = *(void **)target_array_borrowed;
+        } else {
+          if (!*(void **)target_array_owned) CeedCallCuda(ceed, cudaMalloc(target_array_owned, size_unit * num_values));
+          *(void **)target_array = *(void **)target_array_owned;
+        }
+      }
+      if (source_array) CeedCallCuda(ceed, cudaMemcpy(*(void **)target_array, source_array, size_unit * num_values, cudaMemcpyDeviceToDevice));
       break;
     case CEED_OWN_POINTER:
       CeedCallCuda(ceed, cudaFree(*(void **)target_array_owned));
diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c
index 597aee9037..b73ef1ffb8 100644
--- a/backends/hip/ceed-hip-common.c
+++ b/backends/hip/ceed-hip-common.c
@@ -53,10 +53,15 @@ static inline int CeedSetDeviceGenericArray_Hip(Ceed ceed, const void *source_ar
                                                 void *target_array_owned, void *target_array_borrowed, void *target_array) {
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      if (!*(void **)target_array_owned) CeedCallHip(ceed, hipMalloc(target_array_owned, size_unit * num_values));
-      if (source_array) CeedCallHip(ceed, hipMemcpy(*(void **)target_array_owned, source_array, size_unit * num_values, hipMemcpyDeviceToDevice));
-      *(void **)target_array_borrowed = NULL;
-      *(void **)target_array          = *(void **)target_array_owned;
+      if (!*(void **)target_array) {
+        if (*(void **)target_array_borrowed) {
+          *(void **)target_array = *(void **)target_array_borrowed;
+        } else {
+          if (!*(void **)target_array_owned) CeedCallHip(ceed, hipMalloc(target_array_owned, size_unit * num_values));
+          *(void **)target_array = *(void **)target_array_owned;
+        }
+      }
+      if (source_array) CeedCallHip(ceed, hipMemcpy(*(void **)target_array, source_array, size_unit * num_values, hipMemcpyDeviceToDevice));
       break;
     case CEED_OWN_POINTER:
       CeedCallHip(ceed, hipFree(*(void **)target_array_owned));
diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index 52716d5c70..b2d6b5efd5 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -57,11 +57,13 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
     VALGRIND_DISCARD(impl->allocated_block_id);
   }
   CeedCallBackend(CeedFree(&impl->array_allocated));
-  if (impl->array_owned) {
-    for (CeedSize i = 0; i < length; i++) impl->array_owned[i] = NAN;
-    VALGRIND_DISCARD(impl->owned_block_id);
+  if (copy_mode != CEED_COPY_VALUES) {
+    if (impl->array_owned) {
+      for (CeedSize i = 0; i < length; i++) impl->array_owned[i] = NAN;
+      VALGRIND_DISCARD(impl->owned_block_id);
+    }
+    CeedCallBackend(CeedFree(&impl->array_owned));
   }
-  CeedCallBackend(CeedFree(&impl->array_owned));
 
   // Clear borrowed block id, if present
   if (impl->array_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id);
@@ -69,8 +71,7 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   // Set internal pointers to external arrays
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      impl->array_owned    = NULL;
-      impl->array_borrowed = NULL;
+      // Nothing to update
       break;
     case CEED_OWN_POINTER:
       impl->array_owned    = array;
diff --git a/interface/ceed.c b/interface/ceed.c
index a25e97a6b7..6ebade1da0 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -349,10 +349,15 @@ static inline int CeedSetHostGenericArray(const void *source_array, CeedCopyMode
                                           void *target_array_owned, void *target_array_borrowed, void *target_array) {
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      if (!*(void **)target_array_owned) CeedCall(CeedCallocArray(num_values, size_unit, target_array_owned));
-      if (source_array) memcpy(*(void **)target_array_owned, source_array, size_unit * num_values);
-      *(void **)target_array_borrowed = NULL;
-      *(void **)target_array          = *(void **)target_array_owned;
+      if (!*(void **)target_array) {
+        if (*(void **)target_array_borrowed) {
+          *(void **)target_array = *(void **)target_array_borrowed;
+        } else {
+          if (!*(void **)target_array_owned) CeedCall(CeedCallocArray(num_values, size_unit, target_array_owned));
+          *(void **)target_array = *(void **)target_array_owned;
+        }
+      }
+      if (source_array) memcpy(*(void **)target_array, source_array, size_unit * num_values);
       break;
     case CEED_OWN_POINTER:
       CeedCall(CeedFree(target_array_owned));
diff --git a/tests/t128-vector.c b/tests/t128-vector.c
new file mode 100644
index 0000000000..037b482cbe
--- /dev/null
+++ b/tests/t128-vector.c
@@ -0,0 +1,51 @@
+/// @file
+/// Test copying into vector with borrowed pointer
+/// \test Test copying into vector with borrowed pointer
+#include <ceed.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedVector x, x_copy;
+  CeedInt    len = 10;
+  CeedScalar array_borrowed[len];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, len, &x);
+  CeedVectorCreate(ceed, len, &x_copy);
+
+  {
+    CeedScalar array[len];
+
+    for (CeedInt i = 0; i < len; i++) {
+      array[i]          = i;
+      array_borrowed[i] = 10 + i;
+    }
+
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
+    CeedVectorSetArray(x_copy, CEED_MEM_HOST, CEED_USE_POINTER, array_borrowed);
+  }
+
+  // Copy to device if preferred
+  {
+    CeedMemType mem_type = CEED_MEM_HOST;
+
+    CeedGetPreferredMemType(ceed, &mem_type);
+    if (mem_type == CEED_MEM_DEVICE) CeedVectorSyncArray(x, CEED_MEM_DEVICE);
+  }
+
+  // Copy and sync borrowed array
+  CeedVectorCopy(x, x_copy);
+  CeedVectorSyncArray(x_copy, CEED_MEM_HOST);
+
+  // Check that borrowed array is the same as the original input array a
+  for (CeedInt i = 0; i < len; i++) {
+    if (array_borrowed[i] != i) printf("Error in copying values of CeedVector\n");
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&x_copy);
+  CeedDestroy(&ceed);
+  return 0;
+}

From 99837b8af49d68657458a57a35df01ab8cfd96a9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 16 Jan 2025 13:04:47 -0700
Subject: [PATCH 263/571] python - fix bad array bounds

---
 python/tests/test-1-vector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tests/test-1-vector.py b/python/tests/test-1-vector.py
index 246b82515e..73ed078bca 100644
--- a/python/tests/test-1-vector.py
+++ b/python/tests/test-1-vector.py
@@ -359,7 +359,7 @@ def test_126(ceed_resource, capsys):
     a = np.arange(10, 10 + n, dtype=ceed.scalar_type())
     x.set_array(a, cmode=libceed.USE_POINTER)
 
-    a2 = np.arange(10, n, dtype=ceed.scalar_type())
+    a2 = np.arange(0, n, dtype=ceed.scalar_type())
     y.set_array(a2, cmode=libceed.USE_POINTER)
 
     y.copy_from(x)

From fd831f258b694b9857c348ebef72fcfdbf8a8f6b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 21 Jan 2025 12:41:48 -0700
Subject: [PATCH 264/571] ci - use native hardware for ARM testing

---
 .github/workflows/c-fortran-test-arm64.yml    | 28 +++++++++++++++++++
 ...ardware.yml => c-fortran-test-ppc64le.yml} |  4 +--
 2 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/c-fortran-test-arm64.yml
 rename .github/workflows/{c-fortran-test-hardware.yml => c-fortran-test-ppc64le.yml} (93%)

diff --git a/.github/workflows/c-fortran-test-arm64.yml b/.github/workflows/c-fortran-test-arm64.yml
new file mode 100644
index 0000000000..fa355ebcf4
--- /dev/null
+++ b/.github/workflows/c-fortran-test-arm64.yml
@@ -0,0 +1,28 @@
+name: ARM
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-24.04-arm]
+        compiler: [gcc-13, clang]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - name: Environment setup
+      uses: actions/checkout@v4
+    - name: Build and test libCEED
+      env:
+        CC: ${{ matrix.compiler }}
+        FC: gfortran-13
+      run: |
+        make info
+        make -j2
+        PROVE_OPTS=-v make prove -j2
diff --git a/.github/workflows/c-fortran-test-hardware.yml b/.github/workflows/c-fortran-test-ppc64le.yml
similarity index 93%
rename from .github/workflows/c-fortran-test-hardware.yml
rename to .github/workflows/c-fortran-test-ppc64le.yml
index 80d395f8d0..709cdcaa84 100644
--- a/.github/workflows/c-fortran-test-hardware.yml
+++ b/.github/workflows/c-fortran-test-ppc64le.yml
@@ -1,4 +1,4 @@
-name: ARM and IBM Power
+name: IBM Power
 
 on:
   push:
@@ -12,7 +12,7 @@ jobs:
       matrix:
         os: [ubuntu-24.04]
         compiler: [gcc-13]
-        arch: [aarch64, ppc64le]
+        arch: [ppc64le]
         distro: [ubuntu22.04]
 
     runs-on: ${{ matrix.os }}

From dc007f05648c670dfdc3e42fab8d6c1219c0afbb Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 27 Jan 2025 11:36:06 -0700
Subject: [PATCH 265/571] cuda - nontensor gen operators

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 238 +++++++++++-------
 backends/cuda-gen/ceed-cuda-gen-operator.c    |  65 ++++-
 backends/cuda-ref/ceed-cuda-ref-operator.c    |   2 +-
 3 files changed, 201 insertions(+), 104 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 99da342f66..187cc18fb6 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -34,6 +34,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
   *Q_1d      = 0;
   *dim       = 0;
   *is_tensor = true;
+
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedBasis basis;
 
@@ -44,14 +45,15 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 
       // Collect dim, P_1d, and Q_1d
       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      *is_tensor = *is_tensor && is_field_tensor;
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      *is_tensor          = *is_tensor && is_field_tensor;
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *dim = field_dim;
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
@@ -67,14 +69,15 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 
       // Collect dim, P_1d, and Q_1d
       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      *is_tensor = *is_tensor && is_field_tensor;
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      *is_tensor          = *is_tensor && is_field_tensor;
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *dim = field_dim;
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
@@ -124,10 +127,10 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
+                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string            P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string            option_name = (is_input ? "inputs" : "outputs");
   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
   CeedInt                elem_size = 0, num_comp = 0, P_1d = 0;
@@ -147,7 +150,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
@@ -184,7 +188,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
+      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       break;
     case CEED_EVAL_GRAD:
@@ -209,13 +213,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
-      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (is_tensor) {
+        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      }
       if (is_at_points) break;  // No G mat for AtPoints
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
         code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
@@ -223,11 +229,12 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         } else {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n";
-          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
+          code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
+               << var_suffix << ");\n";
         }
       }
       break;
@@ -248,9 +255,9 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                       CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
+                                                       CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string               P_name     = "P_1d" + var_suffix;
+  std::string               P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
   CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
   CeedSize                  l_size;
@@ -269,7 +276,8 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
   }
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
-    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedBasisDestroy(&basis));
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
@@ -299,8 +307,8 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           code << "    // CompStride: " << comp_stride << "\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-          code << "    ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
-               << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+               << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
         case CEED_RESTRICTION_STRIDED: {
@@ -315,8 +323,8 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
           }
           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-          code << "    ReadLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-               << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+               << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
         case CEED_RESTRICTION_POINTS: {
@@ -345,8 +353,8 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    WriteLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
-             << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+             << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
       case CEED_RESTRICTION_STRIDED: {
@@ -361,8 +369,8 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    WriteLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-             << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+             << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
       case CEED_RESTRICTION_POINTS:
@@ -383,10 +391,10 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 // Basis
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
-                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
+                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
                                                  bool is_at_points, bool use_3d_slices) {
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
   CeedElemRestriction elem_rstr;
@@ -401,7 +409,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
-    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
@@ -416,29 +425,45 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
+          std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
+
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
+          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
-          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix
-               << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q"
-               << var_suffix << ");\n";
+          std::string function_name = "GradNonTensor";
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
+               << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
@@ -446,11 +471,12 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           code << "    // Nothing to do AtPoints\n";
         } else {
           CeedBasis_Cuda_shared *basis_data;
+          std::string            function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
-          code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
         }
         break;
       }
@@ -468,25 +494,42 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
       case CEED_EVAL_INTERP:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name =
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
+               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
-          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp"
-               << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix
-               << ", r_e" << var_suffix << ");\n";
+          std::string function_name = "GradTransposeNonTensor";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
+               << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -509,8 +552,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
                                                      CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
-                                                     std::string qfunction_name, CeedInt Q_1d, bool is_at_points, bool use_3d_slices) {
-  std::string         Q_name    = "Q_1d";
+                                                     std::string qfunction_name, CeedInt Q_1d, bool is_tensor, bool is_at_points,
+                                                     bool use_3d_slices) {
+  std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
 
@@ -526,25 +570,25 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
         if (is_at_points) {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "*dim];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else if (use_3d_slices) {
@@ -554,7 +598,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_WEIGHT:
@@ -675,7 +719,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
+            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
           } else {
             CeedSize                  l_size = 0;
@@ -774,10 +818,10 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   // Apply QFunction
   code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || is_at_points || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
     code << "1";
   } else {
-    code << "Q_1d";
+    code << Q_name;
   }
   code << ", inputs, outputs);\n";
 
@@ -950,11 +994,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   }
 
   // Load basis source files
-  // TODO: Add non-tensor, AtPoints
-  code << "// Tensor basis source\n";
-  code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
-  code << "// AtPoints basis source\n";
-  code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
+  if (is_tensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  } else {
+    code << "// Non-tensor basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (is_at_points) {
+    code << "// AtPoints basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
+  }
   code << "// CodeGen operator source\n";
   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
 
@@ -966,7 +1016,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
   // Define CEED_Q_VLA
   code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || is_at_points || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
     code << "#define CEED_Q_VLA 1\n\n";
   } else {
     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
@@ -1014,7 +1064,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   }
 
   code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
   if (is_at_points) {
     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
@@ -1027,18 +1077,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
+  code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
 
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(
-        CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
+                                                              is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(
-        CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
+                                                              is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -1056,7 +1106,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -1066,7 +1116,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
@@ -1125,17 +1175,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
-                                                                Q_1d, true, is_at_points, use_3d_slices));
+                                                                Q_1d, true, is_tensor, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(
-        CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
+                                                          is_at_points, use_3d_slices));
   }
 
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
-                                                            num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points,
-                                                            use_3d_slices));
+                                                            num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
+                                                            is_at_points, use_3d_slices));
 
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
@@ -1143,12 +1193,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
     code << "    // ---- Output field " << i << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points,
-                                                          use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
+                                                          is_at_points, use_3d_slices));
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
-                                                                is_at_points, use_3d_slices));
+                                                                is_tensor, is_at_points, use_3d_slices));
   }
 
   // Close loop and function
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 570e8735c1..468729595c 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -11,6 +11,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -98,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                    is_at_points;
+  bool                    is_at_points, is_tensor;
   Ceed                    ceed;
   Ceed_Cuda              *cuda_data;
   CeedInt                 num_elem, num_input_fields, num_output_fields;
@@ -110,16 +111,62 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Cuda_gen  *data;
 
-  // Check for tensor-product bases
+  // Check for shared bases
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   {
-    bool has_tensor_bases;
+    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
 
-    CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases));
-    // -- Fallback to ref if not all bases are tensor-product
-    if (!has_tensor_bases) {
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+    // -- Fallback to ref if not all bases are shared
+    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
       CeedOperator op_fallback;
 
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases");
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to large non-tensor bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
@@ -132,7 +179,6 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Creation of the operator
@@ -232,8 +278,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
   int           max_threads_per_block, min_grid_size, grid;
 
+  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};
+  int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
 
   CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
                                      cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index fe873c74b5..319d10145f 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -132,7 +132,7 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
     // Input CEED_VECTOR_ACTIVE
     // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE
     // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT
-    // Input passive vectorr with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
+    // Input passive vector with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
     is_active = l_vec == CEED_VECTOR_ACTIVE;
     CeedCallBackend(CeedVectorDestroy(&l_vec));

From 9123fb08d52f01bdd0d1f3a790ba84e4ab900e9f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 29 Jan 2025 09:51:38 -0700
Subject: [PATCH 266/571] hip - nontensor gen operators

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |   6 +-
 backends/cuda-gen/ceed-cuda-gen-operator.c    |   6 +-
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 241 +++++++++++-------
 backends/hip-gen/ceed-hip-gen-operator.c      |  66 ++++-
 backends/hip-ref/ceed-hip-ref-operator.c      |   2 +-
 5 files changed, 209 insertions(+), 112 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 187cc18fb6..d19eedd491 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -45,7 +45,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 
       // Collect dim, P_1d, and Q_1d
       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      *is_tensor          = *is_tensor && is_field_tensor;
+      *is_tensor = *is_tensor && is_field_tensor;
       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
@@ -69,7 +69,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 
       // Collect dim, P_1d, and Q_1d
       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      *is_tensor          = *is_tensor && is_field_tensor;
+      *is_tensor = *is_tensor && is_field_tensor;
       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
@@ -1040,7 +1040,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "// d_[in,out]_i:   CeedVector device array\n";
   code << "// r_[in,out]_e_i: Element vector register\n";
   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
-  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficents register\n";
+  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
   code << "// \n";
   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 468729595c..8f080ed544 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -127,7 +127,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
         Ceed        basis_ceed;
 
         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor    &= is_tensor;
+        is_all_tensor &= is_tensor;
         is_all_nontensor &= !is_tensor;
         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
@@ -150,7 +150,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
         Ceed        basis_ceed;
 
         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor    &= is_tensor;
+        is_all_tensor &= is_tensor;
         is_all_nontensor &= !is_tensor;
 
         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
@@ -166,7 +166,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
     if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
       CeedOperator op_fallback;
 
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to large non-tensor bases");
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due unsupported bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index bd56a77350..1df23c3d7e 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -61,6 +61,7 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
   *Q_1d      = 0;
   *dim       = 0;
   *is_tensor = true;
+
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedBasis basis;
 
@@ -71,14 +72,15 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 
       // Collect dim, P_1d, and Q_1d
       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
       *is_tensor = *is_tensor && is_field_tensor;
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *dim = field_dim;
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
@@ -94,14 +96,15 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 
       // Collect dim, P_1d, and Q_1d
       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
       *is_tensor = *is_tensor && is_field_tensor;
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *dim = field_dim;
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
       *Q_1d = field_Q_1d;
     }
@@ -151,9 +154,10 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                    CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
+                                                    CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
+                                                    bool use_3d_slices) {
   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string           P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string           option_name = (is_input ? "inputs" : "outputs");
   CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
   CeedInt               elem_size = 0, num_comp = 0, P_1d = 0;
@@ -173,7 +177,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
@@ -210,7 +215,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
+      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       break;
     case CEED_EVAL_GRAD:
@@ -235,13 +240,15 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
-      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (is_tensor) {
+        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      }
       if (is_at_points) break;  // No G mat for AtPoints
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
         code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
@@ -249,11 +256,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         } else {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n";
-          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
+          code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
+               << var_suffix << ");\n";
         }
       }
       break;
@@ -274,9 +282,9 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                      CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
+                                                      CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string              P_name     = "P_1d" + var_suffix;
+  std::string              P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
   CeedInt                  elem_size = 0, num_comp = 0, P_1d = 0;
   CeedSize                 l_size;
@@ -295,7 +303,8 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
   }
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
-    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedBasisDestroy(&basis));
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
@@ -325,8 +334,8 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           code << "    // CompStride: " << comp_stride << "\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-          code << "    ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
-               << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+               << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
         case CEED_RESTRICTION_STRIDED: {
@@ -341,8 +350,8 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
           }
           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-          code << "    ReadLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-               << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+               << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
         case CEED_RESTRICTION_POINTS: {
@@ -371,8 +380,8 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    WriteLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
-             << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+             << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
       case CEED_RESTRICTION_STRIDED: {
@@ -387,8 +396,8 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    WriteLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
-             << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+             << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
       case CEED_RESTRICTION_POINTS:
@@ -409,10 +418,10 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
 // Basis
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
-                                                CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
+                                                CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
                                                 bool is_at_points, bool use_3d_slices) {
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
+  std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
   CeedElemRestriction elem_rstr;
@@ -427,7 +436,8 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
-    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
@@ -442,29 +452,45 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
+          std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
+
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
+          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
+               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
-          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix
-               << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q"
-               << var_suffix << ");\n";
+          std::string function_name = "GradNonTensor";
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
+               << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
@@ -472,11 +498,12 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           code << "    // Nothing to do AtPoints\n";
         } else {
           CeedBasis_Hip_shared *basis_data;
+          std::string           function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
-          code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
         }
         break;
       }
@@ -494,25 +521,42 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
       case CEED_EVAL_INTERP:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name =
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
-               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
+               << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
+               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
-          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp"
-               << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix
-               << ", r_e" << var_suffix << ");\n";
+          std::string function_name = "GradTransposeNonTensor";
+
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
+               << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -534,9 +578,9 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
 static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt max_num_points,
                                                     CeedInt num_input_fields, CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
                                                     CeedInt num_output_fields, CeedOperatorField *op_output_fields,
-                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d, bool is_at_points,
-                                                    bool use_3d_slices) {
-  std::string         Q_name    = "Q_1d";
+                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d, bool is_tensor,
+                                                    bool is_at_points, bool use_3d_slices) {
+  std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
 
@@ -552,25 +596,25 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
         if (is_at_points) {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "*dim];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else if (use_3d_slices) {
@@ -580,7 +624,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_WEIGHT:
@@ -701,7 +745,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
+            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
           } else {
             CeedSize                 l_size = 0;
@@ -717,6 +761,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
             code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
@@ -799,10 +844,10 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   // Apply QFunction
   code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || is_at_points || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
     code << "1";
   } else {
-    code << "Q_1d";
+    code << Q_name;
   }
   code << ", inputs, outputs);\n";
 
@@ -962,11 +1007,17 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   }
 
   // Load basis source files
-  // TODO: Add non-tensor, AtPoints
-  code << "// Tensor basis source\n";
-  code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
-  code << "// AtPoints basis source\n";
-  code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
+  if (is_tensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  } else {
+    code << "// Non-tensor basis source\n";
+    code << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (is_at_points) {
+    code << "// AtPoints basis source\n";
+    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
+  }
   code << "// CodeGen operator source\n";
   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
 
@@ -978,7 +1029,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
   // Define CEED_Q_VLA
   code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || is_at_points || use_3d_slices) {
+  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
     code << "#define CEED_Q_VLA 1\n\n";
   } else {
     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
@@ -1002,7 +1053,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   code << "// d_[in,out]_i:   CeedVector device array\n";
   code << "// r_[in,out]_e_i: Element vector register\n";
   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
-  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficents register\n";
+  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
   code << "// \n";
   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
@@ -1026,7 +1077,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   }
 
   code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
   if (is_at_points) {
     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
@@ -1039,18 +1090,18 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
+  code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
 
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(
-        CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
+                                                             is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(
-        CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
+                                                             is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -1068,7 +1119,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -1078,7 +1129,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
@@ -1137,17 +1188,17 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d,
-                                                               true, is_at_points, use_3d_slices));
+                                                               true, is_tensor, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(
-        CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
+                                                         is_at_points, use_3d_slices));
   }
 
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
-                                                           num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points,
-                                                           use_3d_slices));
+                                                           num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
+                                                           is_at_points, use_3d_slices));
 
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
@@ -1155,12 +1206,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
     code << "    // ---- Output field " << i << "\n";
 
     // ---- Basis action
-    CeedCallBackend(
-        CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
+                                                         is_at_points, use_3d_slices));
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
-                                                               is_at_points, use_3d_slices));
+                                                               is_tensor, is_at_points, use_3d_slices));
   }
 
   // Close loop and function
@@ -1173,7 +1224,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
 
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes));
   CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
                                   block_sizes[0] * block_sizes[1] * block_sizes[2]));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index d3455ac137..e1b6595a0f 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -35,7 +35,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                   is_at_points;
+  bool                   is_at_points, is_tensor;
   Ceed                   ceed;
   CeedInt                num_elem, num_input_fields, num_output_fields;
   CeedEvalMode           eval_mode;
@@ -46,16 +46,62 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedOperatorField     *op_input_fields, *op_output_fields;
   CeedOperator_Hip_gen  *data;
 
-  // Check for tensor-product bases
+  // Check for shared bases
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   {
-    bool has_tensor_bases;
+    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
 
-    CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases));
-    // -- Fallback to ref if not all bases are tensor-product
-    if (!has_tensor_bases) {
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+    // -- Fallback to ref if not all bases are shared
+    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
       CeedOperator op_fallback;
 
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to non-tensor bases");
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to unsupported bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
@@ -67,7 +113,6 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Creation of the operator
@@ -160,8 +205,9 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
   CeedInt       block_sizes[3];
 
-  CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, P_1d, Q_1d, block_sizes));
-  if (dim == 1) {
+  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+  if (dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 6f1119084b..67a3533ee2 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -131,7 +131,7 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
     // Input CEED_VECTOR_ACTIVE
     // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE
     // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT
-    // Input passive vectorr with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
+    // Input passive vector with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
     is_active = l_vec == CEED_VECTOR_ACTIVE;
     CeedCallBackend(CeedVectorDestroy(&l_vec));

From f82027a4cec74c9231e41878a1743c92167114a7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 30 Jan 2025 14:55:36 -0700
Subject: [PATCH 267/571] gpu - update gen non-tensor block strategy

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 11 +++++++++--
 backends/hip-gen/ceed-hip-gen-operator.c   | 12 ++++++++++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 8f080ed544..43d388e293 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -282,8 +282,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
   int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
 
-  CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
-                                     cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  if (is_tensor) {
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  } else {
+    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+
+    grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+    block[2] = elems_per_block;
+  }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index e1b6595a0f..a2a6ccd1f1 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -203,10 +203,18 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   const CeedInt Q_1d      = data->Q_1d;
   const CeedInt P_1d      = data->max_P_1d;
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  CeedInt       block_sizes[3];
 
   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
-  CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+  CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+
+  if (is_tensor) {
+    CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+  } else {
+    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+
+    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
+    block_sizes[2]  = elems_per_block;
+  }
   if (dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);

From da5de306636ec0d157763fe187cd5223e023ed1d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 31 Jan 2025 14:32:46 -0700
Subject: [PATCH 268/571] doc - not nontensor gen support

---
 doc/sphinx/source/releasenotes.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 816187fd1f..a9f765a2fe 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -24,6 +24,7 @@ On this page we provide a summary of the main API changes, new features and exam
 - Enable `#pragma once` for all JiT source; remove duplicate includes in JiT source string before compilation.
 - Allow user to set additional compiler options for CUDA and HIP JiT.
 Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`.
+- Added non-tensor basis support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen`.
 
 ### Examples
 

From 19feff82d9a67df6ce1e0b86aa12b71dfb25f41c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 31 Jan 2025 14:12:30 -0700
Subject: [PATCH 269/571] minor - guard divide by zero

---
 interface/ceed-operator.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 09eb33ada6..a3f6dfa6d5 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1719,8 +1719,9 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
     CeedQFunctionField *qf_input_fields, *qf_output_fields;
     CeedOperatorField  *op_input_fields, *op_output_fields;
 
-    CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
     CeedCall(CeedOperatorGetNumElements(op, &num_elem));
+    if (num_elem == 0) return CEED_ERROR_SUCCESS;
+    CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
     if (is_at_points) {
       CeedMemType         mem_type;
       CeedElemRestriction rstr_points = NULL;

From ddae5012d4ca987da08499b586cefc9e622c3919 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Feb 2025 12:22:52 -0700
Subject: [PATCH 270/571] cuda - gen fallback to shared if error

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 81 +++++++++++++++++--
 .../cuda-gen/ceed-cuda-gen-operator-build.h   |  2 +-
 backends/cuda-gen/ceed-cuda-gen-operator.c    | 78 +++++-------------
 backends/cuda-gen/ceed-cuda-gen.h             |  1 +
 backends/cuda/ceed-cuda-compile.cpp           | 78 +++++++++++++-----
 backends/cuda/ceed-cuda-compile.h             |  3 +
 6 files changed, 160 insertions(+), 83 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index d19eedd491..4e06536adf 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -916,7 +916,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 //------------------------------------------------------------------------------
 // Build single operator kernel
 //------------------------------------------------------------------------------
-extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
+extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
   bool                    is_tensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                    ceed;
   CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
@@ -927,18 +927,77 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   CeedOperator_Cuda_gen  *data;
   std::ostringstream      code;
 
+  CeedCallBackend(CeedOperatorGetData(op, &data));
   {
     bool is_setup_done;
 
     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
-    if (is_setup_done) return CEED_ERROR_SUCCESS;
+    if (is_setup_done) {
+      *is_good_build = !data->use_fallback;
+      return CEED_ERROR_SUCCESS;
+    }
   }
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+
+  // Check field compatibility
+  {
+    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
 
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+    // -- Fallback to ref if not all bases are shared
+    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
+      *is_good_build = false;
+      return CEED_ERROR_SUCCESS;
+    }
+  }
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Get operator data
@@ -1207,8 +1266,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "// -----------------------------------------------------------------------------\n\n";
 
   // Compile
-  CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
+  {
+    bool is_compile_good = false;
+
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
+    } else {
+      *is_good_build     = false;
+      data->use_fallback = true;
+    }
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
index 28031e8e3b..88e20ceda2 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
@@ -6,4 +6,4 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op);
+CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 43d388e293..175a4c0034 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                    is_at_points, is_tensor;
+  bool                    is_at_points, is_tensor, is_good_run = true;
   Ceed                    ceed;
   Ceed_Cuda              *cuda_data;
   CeedInt                 num_elem, num_input_fields, num_output_fields;
@@ -111,62 +111,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Cuda_gen  *data;
 
-  // Check for shared bases
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  // Creation of the operator
   {
-    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
-
-    for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedBasis basis;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        bool        is_tensor = true;
-        const char *resource;
-        char       *resource_root;
-        Ceed        basis_ceed;
-
-        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
-        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
-        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
-        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
-        CeedCallBackend(CeedFree(&resource_root));
-        CeedCallBackend(CeedDestroy(&basis_ceed));
-      }
-      CeedCallBackend(CeedBasisDestroy(&basis));
-    }
+    bool is_good_build = false;
 
-    for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedBasis basis;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        bool        is_tensor = true;
-        const char *resource;
-        char       *resource_root;
-        Ceed        basis_ceed;
-
-        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
-
-        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
-        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
-        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
-        CeedCallBackend(CeedFree(&resource_root));
-        CeedCallBackend(CeedDestroy(&basis_ceed));
-      }
-      CeedCallBackend(CeedBasisDestroy(&basis));
-    }
-    // -- Fallback to ref if not all bases are shared
-    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
+    CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_good_build));
+    if (!is_good_build) {
       CeedOperator op_fallback;
 
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due unsupported bases");
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to code generation issue");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
@@ -179,11 +132,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
-  // Creation of the operator
-  CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op));
-
   // Input vectors
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
@@ -293,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
+  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_good_run, opargs));
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -349,8 +300,21 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+  // Cleanup
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
+
+  // Fallback if run was bad (out of resources)
+  if (!is_good_run) {
+    CeedOperator op_fallback;
+
+    data->use_fallback = true;
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to kernel execution issue");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+    return CEED_ERROR_SUCCESS;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index c88e9fd18f..09b66171e9 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -12,6 +12,7 @@
 #include <cuda.h>
 
 typedef struct {
+  bool           use_fallback;
   CeedInt        dim;
   CeedInt        Q_1d;
   CeedInt        max_P_1d;
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 20c57db2e8..4c196fe297 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -34,7 +34,8 @@
 //------------------------------------------------------------------------------
 // Compile CUDA kernel
 //------------------------------------------------------------------------------
-int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
+static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
+                                const CeedInt num_defines, va_list args) {
   size_t                ptx_size;
   char                 *ptx;
   const int             num_opts            = 4;
@@ -50,8 +51,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
 
   // Get kernel specific options, such as kernel constants
   if (num_defines > 0) {
-    va_list args;
-    va_start(args, num_defines);
     char *name;
     int   val;
 
@@ -60,7 +59,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
       val  = va_arg(args, int);
       code << "#define " << name << " " << val << "\n";
     }
-    va_end(args);
   }
 
   // Standard libCEED definitions for CUDA backends
@@ -133,14 +131,17 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
     CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
   }
   CeedCallBackend(CeedFree(&opts));
-  if (result != NVRTC_SUCCESS) {
+  *is_compile_good = result == NVRTC_SUCCESS;
+  if (!*is_compile_good) {
     char  *log;
     size_t log_size;
 
-    CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
-    CeedCallBackend(CeedMalloc(log_size, &log));
-    CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
-    return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
+    if (throw_error) {
+      CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
+      CeedCallBackend(CeedMalloc(log_size, &log));
+      CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
+    }
   }
 
 #if CUDA_VERSION >= 11010
@@ -159,6 +160,25 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   return CEED_ERROR_SUCCESS;
 }
 
+int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
+  bool    is_compile_good = true;
+  va_list args;
+
+  va_start(args, num_defines);
+  CeedCallBackend(CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args));
+  va_end(args);
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
+  va_list args;
+
+  va_start(args, num_defines);
+  CeedCallBackend(CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args));
+  va_end(args);
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Get CUDA kernel
 //------------------------------------------------------------------------------
@@ -200,24 +220,44 @@ int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, con
 //------------------------------------------------------------------------------
 // Run CUDA kernel for spatial dimension with shared memory
 //------------------------------------------------------------------------------
-int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
-                                const int block_size_z, const int shared_mem_size, void **args) {
+static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
+                                           const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run,
+                                           void **args) {
 #if CUDA_VERSION >= 9000
   cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
 #endif
   CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
 
   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
-    int max_threads_per_block, shared_size_bytes, num_regs;
-
-    cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
-    cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
-    cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
-    return CeedError(ceed, CEED_ERROR_BACKEND,
-                     "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
-                     max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+    *is_good_run = false;
+    if (throw_error) {
+      int max_threads_per_block, shared_size_bytes, num_regs;
+
+      cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
+      cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
+      cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
+      return CeedError(ceed, CEED_ERROR_BACKEND,
+                       "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
+                       max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+    }
   } else CeedChk_Cu(ceed, result);
   return CEED_ERROR_SUCCESS;
 }
 
+int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
+                                const int block_size_z, const int shared_mem_size, void **args) {
+  bool is_good_run = true;
+
+  CeedCallBackend(
+      CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
+                                   const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
+  CeedCallBackend(
+      CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h
index 846de28c9d..21204a495d 100644
--- a/backends/cuda/ceed-cuda-compile.h
+++ b/backends/cuda/ceed-cuda-compile.h
@@ -13,6 +13,7 @@
 static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; }
 
 CEED_INTERN int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...);
+CEED_INTERN int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...);
 
 CEED_INTERN int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel);
 
@@ -24,3 +25,5 @@ CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_siz
 
 CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
                                             int shared_mem_size, void **args);
+CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
+                                               int shared_mem_size, bool *is_good_run, void **args);

From 8d12f40e0e187f71c4a1a78742076f931e72da09 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Feb 2025 13:06:17 -0700
Subject: [PATCH 271/571] hip - gen fallback to shared if error

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |  2 +-
 backends/cuda-gen/ceed-cuda-gen-operator.c    |  6 +-
 backends/cuda/ceed-cuda-compile.cpp           | 12 ++-
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 83 ++++++++++++++++--
 .../hip-gen/ceed-hip-gen-operator-build.h     |  2 +-
 backends/hip-gen/ceed-hip-gen-operator.c      | 85 ++++++-------------
 backends/hip-gen/ceed-hip-gen.h               |  1 +
 backends/hip/ceed-hip-compile.cpp             | 49 +++++++++--
 backends/hip/ceed-hip-compile.h               |  3 +
 9 files changed, 159 insertions(+), 84 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 4e06536adf..69f6788c19 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -937,9 +937,9 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
       return CEED_ERROR_SUCCESS;
     }
   }
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
 
   // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   {
     bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 175a4c0034..3410cdcfb7 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                    is_at_points, is_tensor, is_good_run = true;
+  bool                    is_at_points, is_tensor, is_run_good = true;
   Ceed                    ceed;
   Ceed_Cuda              *cuda_data;
   CeedInt                 num_elem, num_input_fields, num_output_fields;
@@ -244,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_good_run, opargs));
+  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -306,7 +306,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedQFunctionDestroy(&qf));
 
   // Fallback if run was bad (out of resources)
-  if (!is_good_run) {
+  if (!is_run_good) {
     CeedOperator op_fallback;
 
     data->use_fallback = true;
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 4c196fe297..6c0e07c0b9 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -132,16 +132,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   }
   CeedCallBackend(CeedFree(&opts));
   *is_compile_good = result == NVRTC_SUCCESS;
-  if (!*is_compile_good) {
+  if (!*is_compile_good && throw_error) {
     char  *log;
     size_t log_size;
 
-    if (throw_error) {
-      CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
-      CeedCallBackend(CeedMalloc(log_size, &log));
-      CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
-      return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
-    }
+    CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
+    CeedCallBackend(CeedMalloc(log_size, &log));
+    CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
+    return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
   }
 
 #if CUDA_VERSION >= 11010
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 1df23c3d7e..4c3479ee21 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -942,7 +942,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 //------------------------------------------------------------------------------
 // Build single operator kernel
 //------------------------------------------------------------------------------
-extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
+extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
   bool                   is_tensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                   ceed;
   CeedInt                Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
@@ -953,18 +953,77 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   CeedOperator_Hip_gen  *data;
   std::ostringstream     code;
 
+  CeedCallBackend(CeedOperatorGetData(op, &data));
   {
     bool is_setup_done;
 
     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
-    if (is_setup_done) return CEED_ERROR_SUCCESS;
+    if (is_setup_done) {
+      *is_good_build = !data->use_fallback;
+      return CEED_ERROR_SUCCESS;
+    }
   }
 
+  // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  {
+    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor &= is_tensor;
+        is_all_nontensor &= !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+    // -- Fallback to ref if not all bases are shared
+    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
+      *is_good_build = false;
+      return CEED_ERROR_SUCCESS;
+    }
+  }
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Get operator data
@@ -1225,9 +1284,19 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes));
-  CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
-                                  block_sizes[0] * block_sizes[1] * block_sizes[2]));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
+  {
+    bool is_compile_good = false;
+
+    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
+                                       block_sizes[0] * block_sizes[1] * block_sizes[2]));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
+    } else {
+      *is_good_build     = false;
+      data->use_fallback = true;
+    }
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h
index c17ba46eeb..4d5de74269 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.h
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.h
@@ -7,4 +7,4 @@
 #pragma once
 
 CEED_INTERN int BlockGridCalculate_Hip_gen(CeedInt dim, CeedInt num_elem, CeedInt P_1d, CeedInt Q_1d, CeedInt *block_sizes);
-CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op);
+CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index a2a6ccd1f1..da164e2b93 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -35,7 +35,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                   is_at_points, is_tensor;
+  bool                   is_at_points, is_tensor, is_good_run = true;
   Ceed                   ceed;
   CeedInt                num_elem, num_input_fields, num_output_fields;
   CeedEvalMode           eval_mode;
@@ -46,62 +46,15 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedOperatorField     *op_input_fields, *op_output_fields;
   CeedOperator_Hip_gen  *data;
 
-  // Check for shared bases
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  // Creation of the operator
   {
-    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
-
-    for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedBasis basis;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        bool        is_tensor = true;
-        const char *resource;
-        char       *resource_root;
-        Ceed        basis_ceed;
-
-        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
-        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
-        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
-        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
-        CeedCallBackend(CeedFree(&resource_root));
-        CeedCallBackend(CeedDestroy(&basis_ceed));
-      }
-      CeedCallBackend(CeedBasisDestroy(&basis));
-    }
+    bool is_good_build = false;
 
-    for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedBasis basis;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        bool        is_tensor = true;
-        const char *resource;
-        char       *resource_root;
-        Ceed        basis_ceed;
-
-        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
-
-        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
-        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
-        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
-        CeedCallBackend(CeedFree(&resource_root));
-        CeedCallBackend(CeedDestroy(&basis_ceed));
-      }
-      CeedCallBackend(CeedBasisDestroy(&basis));
-    }
-    // -- Fallback to ref if not all bases are shared
-    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
+    CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_good_build));
+    if (!is_good_build) {
       CeedOperator op_fallback;
 
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to unsupported bases");
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to code generation issue");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
@@ -113,11 +66,9 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
-  // Creation of the operator
-  CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op));
-
   // Input vectors
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
@@ -219,17 +170,20 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
+    CeedCallBackend(
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
   } else if (dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
+    CeedCallBackend(
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
   } else if (dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
+    CeedCallBackend(
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
   }
 
   // Restore input arrays
@@ -280,8 +234,21 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+  // Cleanup
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
+
+  // Fallback if run was bad (out of resources)
+  if (!is_good_run) {
+    CeedOperator op_fallback;
+
+    data->use_fallback = true;
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to kernel execution issue");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+    return CEED_ERROR_SUCCESS;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index eb5dd0c893..760fef2ed5 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -12,6 +12,7 @@
 #include <hip/hip_runtime.h>
 
 typedef struct {
+  bool          use_fallback;
   CeedInt       dim;
   CeedInt       Q_1d;
   CeedInt       max_P_1d;
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index dface44ef6..51c83cf222 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -33,7 +33,8 @@
 //------------------------------------------------------------------------------
 // Compile HIP kernel
 //------------------------------------------------------------------------------
-int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) {
+static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, hipModule_t *module,
+                               const CeedInt num_defines, va_list args) {
   size_t                 ptx_size;
   char                  *ptx;
   const int              num_opts            = 4;
@@ -62,8 +63,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
 
   // Kernel specific options, such as kernel constants
   if (num_defines > 0) {
-    va_list args;
-    va_start(args, num_defines);
     char *name;
     int   val;
 
@@ -72,7 +71,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
       val  = va_arg(args, int);
       code << "#define " << name << " " << val << "\n";
     }
-    va_end(args);
   }
 
   // Standard libCEED definitions for HIP backends
@@ -135,7 +133,8 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
     CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
   }
   CeedCallBackend(CeedFree(&opts));
-  if (result != HIPRTC_SUCCESS) {
+  *is_compile_good = result == HIPRTC_SUCCESS;
+  if (!*is_compile_good && throw_error) {
     size_t log_size;
     char  *log;
 
@@ -155,6 +154,25 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   return CEED_ERROR_SUCCESS;
 }
 
+int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) {
+  bool    is_compile_good = true;
+  va_list args;
+
+  va_start(args, num_defines);
+  CeedCallBackend(CeedCompileCore_Hip(ceed, source, true, &is_compile_good, module, num_defines, args));
+  va_end(args);
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...) {
+  va_list args;
+
+  va_start(args, num_defines);
+  CeedCallBackend(CeedCompileCore_Hip(ceed, source, false, is_compile_good, module, num_defines, args));
+  va_end(args);
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Get HIP kernel
 //------------------------------------------------------------------------------
@@ -183,9 +201,28 @@ int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, c
 //------------------------------------------------------------------------------
 // Run HIP kernel for spatial dimension with shared memory
 //------------------------------------------------------------------------------
+static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
+                                          const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, void **args) {
+  hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
+
+  *is_good_run = result == hipSuccess;
+  if (throw_error) CeedCallHip(ceed, result);
+  return CEED_ERROR_SUCCESS;
+}
+
 int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
                                const int block_size_z, const int shared_mem_size, void **args) {
-  CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL));
+  bool is_good_run = true;
+
+  CeedCallBackend(
+      CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
+                                  const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
+  CeedCallBackend(
+      CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h
index d990924ec2..0a29fad33e 100644
--- a/backends/hip/ceed-hip-compile.h
+++ b/backends/hip/ceed-hip-compile.h
@@ -13,6 +13,7 @@
 static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; }
 
 CEED_INTERN int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...);
+CEED_INTERN int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...);
 
 CEED_INTERN int CeedGetKernel_Hip(Ceed ceed, hipModule_t module, const char *name, hipFunction_t *kernel);
 
@@ -23,3 +24,5 @@ CEED_INTERN int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, int grid_s
 
 CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
                                            int shared_mem_size, void **args);
+CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
+                                              int shared_mem_size, bool *is_good_run, void **args);

From c9192aca9c02dc42a6a7d7a897b4af02df4a189e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Feb 2025 16:05:47 -0700
Subject: [PATCH 272/571] gpu - swap out bitwise assignment operators for bools

Co-authored-by: Zach Atkins <zach.atkins@colorado.edu>
---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 12 ++++++------
 backends/hip-gen/ceed-hip-gen-operator-build.cpp   | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 69f6788c19..181346303c 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -954,12 +954,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
         Ceed        basis_ceed;
 
         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_not_tensor && !is_tensor;
         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
         CeedCallBackend(CeedFree(&resource_root));
         CeedCallBackend(CeedDestroy(&basis_ceed));
       }
@@ -977,13 +977,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
         Ceed        basis_ceed;
 
         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
 
         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
         CeedCallBackend(CeedFree(&resource_root));
         CeedCallBackend(CeedDestroy(&basis_ceed));
       }
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 4c3479ee21..77c642b9ca 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -980,12 +980,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
         Ceed        basis_ceed;
 
         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
         CeedCallBackend(CeedFree(&resource_root));
         CeedCallBackend(CeedDestroy(&basis_ceed));
       }
@@ -1003,13 +1003,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
         Ceed        basis_ceed;
 
         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-        is_all_tensor &= is_tensor;
-        is_all_nontensor &= !is_tensor;
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
 
         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
-        has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared");
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
         CeedCallBackend(CeedFree(&resource_root));
         CeedCallBackend(CeedDestroy(&basis_ceed));
       }

From 0a2a64927a7a47782a31ad89eee1373d25546e0c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 6 Feb 2025 15:13:38 -0700
Subject: [PATCH 273/571] cuda - remove duplicate mats in gen

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 170 ++++++++++++++++--
 1 file changed, 153 insertions(+), 17 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 181346303c..a7f2681332 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -127,8 +127,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
-                                                     bool use_3d_slices) {
+                                                     CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
+                                                     bool is_at_points, bool use_3d_slices) {
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string            option_name = (is_input ? "inputs" : "outputs");
@@ -138,6 +138,12 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   CeedBasis_Cuda_shared *basis_data;
   CeedBasis              basis;
 
+  // Field reuse info
+  bool         use_previous_field = field_reuse[0] != -1;
+  bool         reuse_input        = field_reuse[1];
+  CeedInt      reuse_field        = field_reuse[0];
+  CeedEvalMode reuse_mode         = (CeedEvalMode)field_reuse[2];
+
   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
 
   // Get field data
@@ -188,8 +194,14 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (use_previous_field) {
+        std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+        code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+      } else {
+        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      }
       break;
     case CEED_EVAL_GRAD:
       if (is_at_points) {
@@ -214,27 +226,51 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
       if (is_tensor) {
-        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        if (use_previous_field) {
+          std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+          code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+        } else {
+          code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        }
       }
       if (is_at_points) break;  // No G mat for AtPoints
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-        code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
+          std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+          code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+        } else {
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        }
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
 
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else {
+            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+            code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          }
         } else {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
-          code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
-               << var_suffix << ");\n";
+          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else {
+            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
+            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
+                 << var_suffix << ");\n";
+          }
         }
       }
       break;
@@ -1138,16 +1174,116 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
 
+  // -- Determine input mat reuse
+  CeedInt input_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i][0] = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i][0] = j;
+          input_matrix_reuse[i][1] = true;
+          input_matrix_reuse[i][2] = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i][0] = j;
+            input_matrix_reuse[i][1] = true;
+            input_matrix_reuse[i][2] = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  CeedInt output_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i][0] = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i][0] = j;
+          output_matrix_reuse[i][1] = true;
+          output_matrix_reuse[i][2] = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i][0] = j;
+            output_matrix_reuse[i][1] = true;
+            output_matrix_reuse[i][2] = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i][0] = j;
+          output_matrix_reuse[i][1] = false;
+          output_matrix_reuse[i][2] = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i][0] = j;
+            output_matrix_reuse[i][1] = false;
+            output_matrix_reuse[i][2] = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
-                                                              is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
+                                                              true, is_tensor, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
-                                                              is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
+                                                              false, is_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements

From 9ee499e57bead864c0606ad2f4caeb6962c54f83 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Feb 2025 10:44:05 -0700
Subject: [PATCH 274/571] hip - remove duplicate mats in gen

---
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 170 ++++++++++++++++--
 1 file changed, 153 insertions(+), 17 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 77c642b9ca..38666e1916 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -154,8 +154,8 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                    CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
-                                                    bool use_3d_slices) {
+                                                    CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
+                                                    bool is_at_points, bool use_3d_slices) {
   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string           option_name = (is_input ? "inputs" : "outputs");
@@ -165,6 +165,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
   CeedBasis_Hip_shared *basis_data;
   CeedBasis             basis;
 
+  // Field reuse info
+  bool         use_previous_field = field_reuse[0] != -1;
+  bool         reuse_input        = field_reuse[1];
+  CeedInt      reuse_field        = field_reuse[0];
+  CeedEvalMode reuse_mode         = (CeedEvalMode)field_reuse[2];
+
   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
 
   // Get field data
@@ -215,8 +221,14 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-      code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (use_previous_field) {
+        std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+        code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+      } else {
+        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      }
       break;
     case CEED_EVAL_GRAD:
       if (is_at_points) {
@@ -241,27 +253,51 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
       if (is_tensor) {
-        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        if (use_previous_field) {
+          std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+          code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+        } else {
+          code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        }
       }
       if (is_at_points) break;  // No G mat for AtPoints
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-        code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
+          std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+          code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+        } else {
+          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        }
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
 
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else {
+            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+            code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          }
         } else {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
-          code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
-               << var_suffix << ");\n";
+          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+
+            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else {
+            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
+            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
+                 << var_suffix << ");\n";
+          }
         }
       }
       break;
@@ -1151,16 +1187,116 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
 
+  // -- Determine input mat reuse
+  CeedInt input_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i][0] = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i][0] = j;
+          input_matrix_reuse[i][1] = true;
+          input_matrix_reuse[i][2] = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i][0] = j;
+            input_matrix_reuse[i][1] = true;
+            input_matrix_reuse[i][2] = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  CeedInt output_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i][0] = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i][0] = j;
+          output_matrix_reuse[i][1] = true;
+          output_matrix_reuse[i][2] = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i][0] = j;
+            output_matrix_reuse[i][1] = true;
+            output_matrix_reuse[i][2] = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i][0] = j;
+          output_matrix_reuse[i][1] = false;
+          output_matrix_reuse[i][2] = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i][0] = j;
+            output_matrix_reuse[i][1] = false;
+            output_matrix_reuse[i][2] = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
-                                                             is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, true,
+                                                             is_tensor, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
-                                                             is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
+                                                             false, is_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements

From 45a787f75d07c3f171308ee0d0be6daefd4754b0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Feb 2025 16:32:38 -0700
Subject: [PATCH 275/571] gpu - use struct over array for clarity

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 83 ++++++++++---------
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 81 +++++++++---------
 2 files changed, 85 insertions(+), 79 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index a7f2681332..f306fdaa0c 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -22,6 +22,12 @@
 #include "../cuda/ceed-cuda-compile.h"
 #include "ceed-cuda-gen.h"
 
+struct FieldReuse_Cuda {
+  CeedInt      index;
+  bool         is_input;
+  CeedEvalMode eval_mode;
+};
+
 //------------------------------------------------------------------------------
 // Determine type of operator
 //------------------------------------------------------------------------------
@@ -127,8 +133,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
-                                                     bool is_at_points, bool use_3d_slices) {
+                                                     CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
+                                                     bool is_tensor, bool is_at_points, bool use_3d_slices) {
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string            option_name = (is_input ? "inputs" : "outputs");
@@ -139,10 +145,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   CeedBasis              basis;
 
   // Field reuse info
-  bool         use_previous_field = field_reuse[0] != -1;
-  bool         reuse_input        = field_reuse[1];
-  CeedInt      reuse_field        = field_reuse[0];
-  CeedEvalMode reuse_mode         = (CeedEvalMode)field_reuse[2];
+  bool use_previous_field = field_reuse.index != -1;
 
   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
 
@@ -195,7 +198,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
       if (use_previous_field) {
-        std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
         code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
       } else {
@@ -227,7 +230,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
       }
       if (is_tensor) {
         if (use_previous_field) {
-          std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
@@ -239,8 +242,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
-          std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
         } else {
@@ -253,8 +256,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
-            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
@@ -262,8 +265,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
             code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         } else {
-          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
-            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
@@ -991,7 +994,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 
         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
         is_all_tensor    = is_all_tensor && is_tensor;
-        is_all_nontensor = is_all_not_tensor && !is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
@@ -1175,10 +1178,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
 
   // -- Determine input mat reuse
-  CeedInt input_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+  FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    input_matrix_reuse[i][0] = -1;
+    input_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedEvalMode eval_mode_i;
@@ -1187,7 +1190,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
-    for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
 
@@ -1196,15 +1199,15 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
       if (basis_i == basis_j) {
         if (is_tensor) {
-          input_matrix_reuse[i][0] = j;
-          input_matrix_reuse[i][1] = true;
-          input_matrix_reuse[i][2] = eval_mode_j;
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
         } else {
           // For non-tensor can only re-use with the same eval mode
           if (eval_mode_i == eval_mode_j) {
-            input_matrix_reuse[i][0] = j;
-            input_matrix_reuse[i][1] = true;
-            input_matrix_reuse[i][2] = eval_mode_j;
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
           }
         }
       }
@@ -1214,10 +1217,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   }
 
   // -- Determine output mat reuse
-  CeedInt output_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+  FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
 
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    output_matrix_reuse[i][0] = -1;
+    output_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode eval_mode_i;
@@ -1225,7 +1228,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
-    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
 
@@ -1234,21 +1237,21 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
       if (basis_i == basis_j) {
         if (is_tensor) {
-          output_matrix_reuse[i][0] = j;
-          output_matrix_reuse[i][1] = true;
-          output_matrix_reuse[i][2] = eval_mode_j;
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
         } else {
           // For non-tensor can only re-use with the same eval mode
           if (eval_mode_i == eval_mode_j) {
-            output_matrix_reuse[i][0] = j;
-            output_matrix_reuse[i][1] = true;
-            output_matrix_reuse[i][2] = eval_mode_j;
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
           }
         }
       }
       CeedCallBackend(CeedBasisDestroy(&basis_j));
     }
-    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
 
@@ -1257,15 +1260,15 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
       if (basis_i == basis_j) {
         if (is_tensor) {
-          output_matrix_reuse[i][0] = j;
-          output_matrix_reuse[i][1] = false;
-          output_matrix_reuse[i][2] = eval_mode_j;
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
         } else {
           // For non-tensor can only re-use with the same eval mode
           if (eval_mode_i == eval_mode_j) {
-            output_matrix_reuse[i][0] = j;
-            output_matrix_reuse[i][1] = false;
-            output_matrix_reuse[i][2] = eval_mode_j;
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
           }
         }
       }
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 38666e1916..b3cd64eef4 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -21,6 +21,12 @@
 #include "../hip/ceed-hip-compile.h"
 #include "ceed-hip-gen.h"
 
+struct FieldReuse_Hip {
+  CeedInt      index;
+  bool         is_input;
+  CeedEvalMode eval_mode;
+};
+
 //------------------------------------------------------------------------------
 // Calculate the block size used for launching the operator kernel
 //------------------------------------------------------------------------------
@@ -154,8 +160,8 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                    CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
-                                                    bool is_at_points, bool use_3d_slices) {
+                                                    CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, CeedInt Q_1d, bool is_input,
+                                                    bool is_tensor, bool is_at_points, bool use_3d_slices) {
   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string           option_name = (is_input ? "inputs" : "outputs");
@@ -166,10 +172,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
   CeedBasis             basis;
 
   // Field reuse info
-  bool         use_previous_field = field_reuse[0] != -1;
-  bool         reuse_input        = field_reuse[1];
-  CeedInt      reuse_field        = field_reuse[0];
-  CeedEvalMode reuse_mode         = (CeedEvalMode)field_reuse[2];
+  bool use_previous_field = field_reuse.index != -1;
 
   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
 
@@ -222,7 +225,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
       if (use_previous_field) {
-        std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
         code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
       } else {
@@ -254,7 +257,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
       }
       if (is_tensor) {
         if (use_previous_field) {
-          std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
@@ -266,8 +269,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
-          std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
         } else {
@@ -280,8 +283,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
-            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
@@ -289,8 +292,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
             code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         } else {
-          if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
-            std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
@@ -1188,10 +1191,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
 
   // -- Determine input mat reuse
-  CeedInt input_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    input_matrix_reuse[i][0] = -1;
+    input_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedEvalMode eval_mode_i;
@@ -1200,7 +1203,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
-    for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
 
@@ -1209,15 +1212,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
       if (basis_i == basis_j) {
         if (is_tensor) {
-          input_matrix_reuse[i][0] = j;
-          input_matrix_reuse[i][1] = true;
-          input_matrix_reuse[i][2] = eval_mode_j;
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
         } else {
           // For non-tensor can only re-use with the same eval mode
           if (eval_mode_i == eval_mode_j) {
-            input_matrix_reuse[i][0] = j;
-            input_matrix_reuse[i][1] = true;
-            input_matrix_reuse[i][2] = eval_mode_j;
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
           }
         }
       }
@@ -1227,10 +1230,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   }
 
   // -- Determine output mat reuse
-  CeedInt output_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
+  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
 
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    output_matrix_reuse[i][0] = -1;
+    output_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedEvalMode eval_mode_i;
@@ -1238,7 +1241,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
-    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
 
@@ -1247,21 +1250,21 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
       if (basis_i == basis_j) {
         if (is_tensor) {
-          output_matrix_reuse[i][0] = j;
-          output_matrix_reuse[i][1] = true;
-          output_matrix_reuse[i][2] = eval_mode_j;
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
         } else {
           // For non-tensor can only re-use with the same eval mode
           if (eval_mode_i == eval_mode_j) {
-            output_matrix_reuse[i][0] = j;
-            output_matrix_reuse[i][1] = true;
-            output_matrix_reuse[i][2] = eval_mode_j;
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
           }
         }
       }
       CeedCallBackend(CeedBasisDestroy(&basis_j));
     }
-    for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
 
@@ -1270,15 +1273,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
       if (basis_i == basis_j) {
         if (is_tensor) {
-          output_matrix_reuse[i][0] = j;
-          output_matrix_reuse[i][1] = false;
-          output_matrix_reuse[i][2] = eval_mode_j;
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
         } else {
           // For non-tensor can only re-use with the same eval mode
           if (eval_mode_i == eval_mode_j) {
-            output_matrix_reuse[i][0] = j;
-            output_matrix_reuse[i][1] = false;
-            output_matrix_reuse[i][2] = eval_mode_j;
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
           }
         }
       }

From 6ab8e59f8abf7a04a361d274e3bc30e1205b0661 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Sat, 8 Feb 2025 09:02:44 -0700
Subject: [PATCH 276/571] doc: Misc doc arrangement fixes

---
 interface/ceed-basis.c    | 108 +++++++++++++++++++-------------------
 interface/ceed-operator.c |   8 +--
 2 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 3d5d51107f..46edf8213e 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -291,6 +291,60 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Check input vector dimensions for CeedBasisApply[Add]
+
+  @param[in]  basis     `CeedBasis` to evaluate
+  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  t_mode    @ref CEED_NOTRANSPOSE to evaluate from nodes to quadrature points;
+                          @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes
+  @param[in]  eval_mode @ref CEED_EVAL_NONE to use values directly,
+                          @ref CEED_EVAL_INTERP to use interpolated values,
+                          @ref CEED_EVAL_GRAD to use gradients,
+                          @ref CEED_EVAL_DIV to use divergence,
+                          @ref CEED_EVAL_CURL to use curl,
+                          @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  u         Input `CeedVector`
+  @param[out] v         Output `CeedVector`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  CeedInt  dim, num_comp, q_comp, num_nodes, num_qpts;
+  CeedSize u_length = 0, v_length;
+
+  CeedCall(CeedBasisGetDimension(basis, &dim));
+  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
+  CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
+  CeedCall(CeedVectorGetLength(v, &v_length));
+  if (u) CeedCall(CeedVectorGetLength(u, &u_length));
+
+  // Check vector lengths to prevent out of bounds issues
+  bool has_good_dims = true;
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_qpts * (CeedSize)q_comp &&
+                        v_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes) ||
+                       (t_mode == CEED_NOTRANSPOSE && v_length >= (CeedSize)num_elem * (CeedSize)num_qpts * (CeedSize)num_comp * (CeedSize)q_comp &&
+                        u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes));
+      break;
+    case CEED_EVAL_WEIGHT:
+      has_good_dims = v_length >= (CeedSize)num_elem * (CeedSize)num_qpts;
+      break;
+  }
+  CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints
 
@@ -1866,60 +1920,6 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Check input vector dimensions for CeedBasisApply[Add]
-
-  @param[in]  basis     `CeedBasis` to evaluate
-  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
-                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
-  @param[in]  t_mode    @ref CEED_NOTRANSPOSE to evaluate from nodes to quadrature points;
-                          @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes
-  @param[in]  eval_mode @ref CEED_EVAL_NONE to use values directly,
-                          @ref CEED_EVAL_INTERP to use interpolated values,
-                          @ref CEED_EVAL_GRAD to use gradients,
-                          @ref CEED_EVAL_DIV to use divergence,
-                          @ref CEED_EVAL_CURL to use curl,
-                          @ref CEED_EVAL_WEIGHT to use quadrature weights
-  @param[in]  u         Input `CeedVector`
-  @param[out] v         Output `CeedVector`
-
-  @return An error code: 0 - success, otherwise - failure
-
-  @ref Developer
-**/
-static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
-  CeedInt  dim, num_comp, q_comp, num_nodes, num_qpts;
-  CeedSize u_length = 0, v_length;
-
-  CeedCall(CeedBasisGetDimension(basis, &dim));
-  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
-  CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-  CeedCall(CeedVectorGetLength(v, &v_length));
-  if (u) CeedCall(CeedVectorGetLength(u, &u_length));
-
-  // Check vector lengths to prevent out of bounds issues
-  bool has_good_dims = true;
-  switch (eval_mode) {
-    case CEED_EVAL_NONE:
-    case CEED_EVAL_INTERP:
-    case CEED_EVAL_GRAD:
-    case CEED_EVAL_DIV:
-    case CEED_EVAL_CURL:
-      has_good_dims = ((t_mode == CEED_TRANSPOSE && u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_qpts * (CeedSize)q_comp &&
-                        v_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes) ||
-                       (t_mode == CEED_NOTRANSPOSE && v_length >= (CeedSize)num_elem * (CeedSize)num_qpts * (CeedSize)num_comp * (CeedSize)q_comp &&
-                        u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes));
-      break;
-    case CEED_EVAL_WEIGHT:
-      has_good_dims = v_length >= (CeedSize)num_elem * (CeedSize)num_qpts;
-      break;
-  }
-  CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Apply basis evaluation from nodes to quadrature points or vice versa
 
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index a3f6dfa6d5..515c4e3f52 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -354,7 +354,7 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
 static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) {
   bool is_composite = false;
@@ -417,7 +417,7 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
 static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, size_t *num_values,
                                              void *values) {
@@ -484,7 +484,7 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
 static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) {
   bool is_composite = false;
@@ -1561,6 +1561,8 @@ int CeedOperatorSetName(CeedOperator op, const char *name) {
   @param[in] is_full Whether to write full operator view or terse
 
   @return Error code: 0 - success, otherwise - failure
+
+  @ref Developer
 **/
 static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
   bool has_name = op->name, is_composite, is_at_points;

From 3efc994b81c669267587c1c03464de8986ef1de5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Feb 2025 12:10:53 -0700
Subject: [PATCH 277/571] hip - fix minor leak

---
 backends/hip-gen/ceed-hip-gen-operator.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index da164e2b93..c081c98cc6 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -75,12 +75,15 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.inputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -90,14 +93,17 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.outputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       output_vecs[i] = vec;
       // Check for multiple output modes
       CeedInt index = -1;
+
       for (CeedInt j = 0; j < i; j++) {
         if (vec == output_vecs[j]) {
           index = j;
@@ -109,6 +115,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
       } else {
         data->fields.outputs[i] = data->fields.outputs[index];
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -191,11 +198,14 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -204,10 +214,12 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       // Check for multiple output modes
       CeedInt index = -1;
 
@@ -220,6 +232,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
       if (index == -1) {
         CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 

From 5e82904b4c5308c8a305aac10dbc3d591382f1c7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Feb 2025 12:56:48 -0700
Subject: [PATCH 278/571] make - use newer rocm option

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 023d1ed559..c9f4dd7cc1 100644
--- a/Makefile
+++ b/Makefile
@@ -170,7 +170,7 @@ ifneq ($(CUDA_ARCH),)
 endif
 HIPCCFLAGS ?= $(filter-out $(OMP_SIMD_FLAG),$(OPT)) -fPIC -munsafe-fp-atomics
 ifneq ($(HIP_ARCH),)
-  HIPCCFLAGS += --amdgpu-target=$(HIP_ARCH)
+  HIPCCFLAGS += --offload-arch=$(HIP_ARCH)
 endif
 SYCL_FLAG := $(SYCL_FLAG.$(CC_VENDOR))
 SYCLFLAGS ?= $(SYCL_FLAG) -fPIC -std=c++17 $(filter-out -std=c++11,$(CXXFLAGS)) $(filter-out $(OMP_SIMD_FLAG),$(OPT))

From 11ac676fef74f333d84c957cd407a32f0dbe85cd Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Fri, 24 Jan 2025 15:28:59 -0800
Subject: [PATCH 279/571] Update hip basis code to conform to vector interfaces

---
 backends/hip-ref/ceed-hip-ref-basis.c       | 56 +++++++--------------
 backends/hip-shared/ceed-hip-shared-basis.c | 29 +++++------
 2 files changed, 32 insertions(+), 53 deletions(-)

diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 7fdfb9f16d..be4fdd459b 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -34,21 +34,14 @@ static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedInt  num_comp, q_comp, num_nodes, num_qpts;
-    CeedSize length;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
-    CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-    length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp));
-    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
+
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
@@ -201,20 +194,12 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedInt  num_comp, q_comp, num_nodes;
-    CeedSize length;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
-    length =
-        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
-    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Basis action
@@ -285,15 +270,12 @@ static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, cons
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedSize length;
-
-    CeedCallBackend(CeedVectorGetLength(v, &length));
-    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Apply basis operation
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index ebe9048d3f..7c54aa2bf2 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -365,20 +365,12 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedInt  num_comp, q_comp, num_nodes;
-    CeedSize length;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
-    length =
-        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
-    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Basis action
@@ -502,8 +494,13 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
 
   // Apply basis operation
   switch (eval_mode) {

From 9dafd6df2806212aa59d2280162ce1a1d431b661 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Mon, 27 Jan 2025 09:57:50 -0700
Subject: [PATCH 280/571] Vector API compliance for CUDA backends

---
 backends/cuda-ref/ceed-cuda-ref-basis.c       | 58 ++++++-------------
 backends/cuda-shared/ceed-cuda-shared-basis.c | 20 ++-----
 2 files changed, 24 insertions(+), 54 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 544a5cb188..b21466f33c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -35,20 +35,12 @@ static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedIn
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedInt  num_comp, q_comp, num_nodes, num_qpts;
-    CeedSize length;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
-    CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-    length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp));
-    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
@@ -203,20 +195,12 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedInt  num_comp, q_comp, num_nodes;
-    CeedSize length;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
-    length =
-        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
-    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Basis action
@@ -287,18 +271,12 @@ static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, con
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedInt  num_comp, q_comp;
-    CeedSize length;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-    length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp));
-    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Apply basis operation
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index f3a73b54ba..06fd102f11 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -297,20 +297,12 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose && !apply_add) {
-    CeedInt  num_comp, q_comp, num_nodes;
-    CeedSize length;
-
-    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-    CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
-    length =
-        (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
-    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Basis action

From 759e0bc390c235748ccd97d5467004760f707fa5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Feb 2025 13:32:08 -0700
Subject: [PATCH 281/571] minor - style consistency

---
 backends/cuda-shared/ceed-cuda-shared-basis.c | 14 ++++++++++----
 backends/hip-shared/ceed-hip-shared-basis.c   |  9 +++++----
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 06fd102f11..de63662ded 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -39,8 +39,11 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
 
   // Apply basis operation
   switch (eval_mode) {
@@ -428,8 +431,11 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
 
   // Apply basis operation
   switch (eval_mode) {
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 7c54aa2bf2..ca627d9061 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -106,8 +106,11 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
-  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
 
   // Apply basis operation
   switch (eval_mode) {
@@ -497,8 +500,6 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
   if (apply_add) {
     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
   } else {
-    // Clear v for transpose operation
-    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 

From 82138112808ac45c6722ef2bfe52ea5cd96df80f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Feb 2025 14:30:19 -0700
Subject: [PATCH 282/571] ex - enable basic benchmarking mode

---
 examples/ceed/ex1-volume.c  | 16 +++++++++++++++-
 examples/ceed/ex2-surface.c | 16 +++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c
index 88bd472bf1..c8a12651a6 100644
--- a/examples/ceed/ex1-volume.c
+++ b/examples/ceed/ex1-volume.c
@@ -60,7 +60,7 @@ int main(int argc, const char *argv[]) {
   CeedInt     sol_degree  = 4;               // polynomial degree for the solution
   CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
   CeedInt     prob_size   = -1;              // approximate problem size
-  CeedInt     help = 0, test = 0, gallery = 0;
+  CeedInt     help = 0, test = 0, gallery = 0, benchmark = 0;
 
   // Process command line arguments.
   for (int ia = 1; ia < argc; ia++) {
@@ -81,6 +81,8 @@ int main(int argc, const char *argv[]) {
       parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-s")) {
       parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-t")) {
       test = 1;
     } else if (!strcmp(argv[ia], "-g")) {
@@ -223,6 +225,18 @@ int main(int argc, const char *argv[]) {
   // Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1
   CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
 
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
   // Compute and print the sum of the entries of 'v' giving the mesh volume.
   CeedScalar volume = 0.;
   {
diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c
index 77b1480227..5109e36d79 100644
--- a/examples/ceed/ex2-surface.c
+++ b/examples/ceed/ex2-surface.c
@@ -60,7 +60,7 @@ int main(int argc, const char *argv[]) {
   CeedInt     sol_degree  = 4;               // polynomial degree for the solution
   CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
   CeedInt     prob_size   = -1;              // approximate problem size
-  CeedInt     help = 0, test = 0, gallery = 0;
+  CeedInt     help = 0, test = 0, gallery = 0, benchmark = 0;
 
   // Process command line arguments.
   for (int ia = 1; ia < argc; ia++) {
@@ -81,6 +81,8 @@ int main(int argc, const char *argv[]) {
       parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-s")) {
       parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-t")) {
       test = 1;
     } else if (!strcmp(argv[ia], "-g")) {
@@ -243,6 +245,18 @@ int main(int argc, const char *argv[]) {
   // Compute the mesh surface area using the diff operator: surface_area = 1^T \cdot abs( K \cdot x).
   CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
 
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
   // Compute and print the sum of the entries of 'v' giving the mesh surface area.
   CeedScalar surface_area = 0.;
   {

From 8c03e814a8aedd48736bf8454f3df41e37fe2fcc Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Feb 2025 17:50:49 -0700
Subject: [PATCH 283/571] petsc - fix vec type checking

---
 examples/petsc/area.c      | 49 +++++++++++++++++++-------------------
 examples/petsc/bps.c       | 31 ++++++++++++------------
 examples/petsc/bpsraw.c    |  1 +
 examples/petsc/bpssphere.c | 31 ++++++++++++------------
 examples/petsc/bpsswarm.c  | 47 ++++++++++++++++++------------------
 examples/petsc/multigrid.c |  5 +++-
 6 files changed, 82 insertions(+), 82 deletions(-)

diff --git a/examples/petsc/area.c b/examples/petsc/area.c
index 6b782e45fa..1f5cca850a 100644
--- a/examples/petsc/area.c
+++ b/examples/petsc/area.c
@@ -71,7 +71,7 @@ int main(int argc, char **argv) {
   Ceed                 ceed;
   CeedData             ceed_data;
   ProblemType          problem_choice;
-  VecType              vec_type;
+  VecType              vec_type = VECSTANDARD;
   PetscMemType         mem_type;
 
   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
@@ -110,15 +110,6 @@ int main(int argc, char **argv) {
   // Create DM
   PetscCall(SetupDMByDegree(dm, degree, q_extra, num_comp_u, topo_dim, false));
 
-  // Create vectors
-  PetscCall(DMCreateGlobalVector(dm, &U));
-  PetscCall(VecGetLocalSize(U, &l_size));
-  PetscCall(VecGetSize(U, &g_size));
-  PetscCall(DMCreateLocalVector(dm, &U_loc));
-  PetscCall(VecGetSize(U_loc, &xl_size));
-  PetscCall(VecDuplicate(U, &V));
-  PetscCall(VecDuplicate(U_loc, &V_loc));
-
   // Setup op_apply_ctx structure
   PetscCall(PetscMalloc1(1, &op_apply_ctx));
 
@@ -127,23 +118,31 @@ int main(int argc, char **argv) {
   CeedMemType mem_type_backend;
   CeedGetPreferredMemType(ceed, &mem_type_backend);
 
-  PetscCall(DMGetVecType(dm, &vec_type));
-  if (!vec_type) {  // Not yet set by op_apply_ctx -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
+  // Set mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
     }
-    PetscCall(DMSetVecType(dm, vec_type));
   }
+  PetscCall(DMSetVecType(dm, vec_type));
+
+  // Create vectors
+  PetscCall(DMCreateGlobalVector(dm, &U));
+  PetscCall(VecGetLocalSize(U, &l_size));
+  PetscCall(VecGetSize(U, &g_size));
+  PetscCall(DMCreateLocalVector(dm, &U_loc));
+  PetscCall(VecGetSize(U_loc, &xl_size));
+  PetscCall(VecDuplicate(U, &V));
+  PetscCall(VecDuplicate(U_loc, &V_loc));
 
   // Print summary
   if (!test_mode) {
diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index 9040add679..ecfe294798 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -62,7 +62,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
   CeedQFunction        qf_error;
   CeedOperator         op_error;
   CeedVector           rhs_ceed, target;
-  VecType              vec_type;
+  VecType              vec_type = VECSTANDARD;
   PetscMemType         mem_type;
 
   PetscFunctionBeginUser;
@@ -71,23 +71,22 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
   CeedMemType mem_type_backend;
   CeedGetPreferredMemType(ceed, &mem_type_backend);
 
-  PetscCall(DMGetVecType(dm, &vec_type));
-  if (!vec_type) {  // Not yet set by user -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
+  // Set mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
     }
-    PetscCall(DMSetVecType(dm, vec_type));
   }
+  PetscCall(DMSetVecType(dm, vec_type));
 
   // Create global and local solution vectors
   PetscCall(DMCreateGlobalVector(dm, &X));
diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index 8a6e21bc93..e8d901b410 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -403,6 +403,7 @@ int main(int argc, char **argv) {
       break;
     case CEED_MEM_DEVICE: {
       const char *resolved;
+
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) default_vec_type = VECCUDA;
       else if (strstr(resolved, "/gpu/hip/occa")) default_vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c
index 9fc63472fa..ab9648a4de 100644
--- a/examples/petsc/bpssphere.c
+++ b/examples/petsc/bpssphere.c
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
   CeedOperator         op_error;
   CeedVector           rhs_ceed, target;
   BPType               bp_choice;
-  VecType              vec_type;
+  VecType              vec_type = VECSTANDARD;
   PetscMemType         mem_type;
 
   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
@@ -130,23 +130,22 @@ int main(int argc, char **argv) {
   CeedMemType mem_type_backend;
   CeedGetPreferredMemType(ceed, &mem_type_backend);
 
-  PetscCall(DMGetVecType(dm, &vec_type));
-  if (!vec_type) {  // Not yet set by user -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
+  // Set mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
     }
-    PetscCall(DMSetVecType(dm, vec_type));
   }
+  PetscCall(DMSetVecType(dm, vec_type));
 
   // Print summary
   if (!test_mode) {
diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 0609a8d7c0..a5acffd52e 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
   CeedData             ceed_data;
   CeedOperator         op_error;
   BPType               bp_choice;
-  VecType              vec_type;
+  VecType              vec_type         = VECSTANDARD;
   PointSwarmType       point_swarm_type = SWARM_GAUSS;
   PetscMPIInt          ranks_per_node;
   char                 hostname[PETSC_MAX_PATH_LEN];
@@ -195,6 +195,28 @@ int main(int argc, char **argv) {
   PetscCall(PetscObjectSetName((PetscObject)dm_swarm, "Particle Swarm"));
   PetscCall(DMViewFromOptions(dm_swarm, NULL, "-dm_swarm_view"));
 
+  // Set up libCEED
+  CeedInit(ceed_resource, &ceed);
+  CeedMemType mem_type_backend;
+  CeedGetPreferredMemType(ceed, &mem_type_backend);
+
+  // Set background mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
+    }
+  }
+  PetscCall(DMSetVecType(dm_mesh, vec_type));
+
   // Create vectors
   PetscCall(DMCreateGlobalVector(dm_mesh, &X));
   PetscCall(VecGetLocalSize(X, &l_size));
@@ -211,29 +233,6 @@ int main(int argc, char **argv) {
   PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed));
   PetscCall(MatShellSetOperation(mat_O, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag));
 
-  // Set up libCEED
-  CeedInit(ceed_resource, &ceed);
-  CeedMemType mem_type_backend;
-  CeedGetPreferredMemType(ceed, &mem_type_backend);
-
-  PetscCall(DMGetVecType(dm_mesh, &vec_type));
-  if (!vec_type) {  // Not yet set by user -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
-    }
-    PetscCall(DMSetVecType(dm_mesh, vec_type));
-  }
-
   // Print summary
   if (!test_mode) {
     PetscInt P = degree + 1, Q = P + q_extra;
diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index 4fd9a62166..60926fef5f 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -120,13 +120,16 @@ int main(int argc, char **argv) {
     PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, 0, PETSC_FALSE, &dm_orig));
   }
 
-  VecType vec_type;
+  // Set mesh vec_type
+  VecType vec_type = VECSTANDARD;
+
   switch (mem_type_backend) {
     case CEED_MEM_HOST:
       vec_type = VECSTANDARD;
       break;
     case CEED_MEM_DEVICE: {
       const char *resolved;
+
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
       else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678

From 072672ed2b369cbf5116989ce5cefb44381c6b56 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 12 Feb 2025 10:53:35 -0700
Subject: [PATCH 284/571] petsc - allow setting vec type in command line again

---
 examples/petsc/bps.c      | 1 +
 examples/petsc/bpsswarm.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index ecfe294798..3d6e475385 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -87,6 +87,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
     }
   }
   PetscCall(DMSetVecType(dm, vec_type));
+  PetscCall(DMSetFromOptions(dm));
 
   // Create global and local solution vectors
   PetscCall(DMCreateGlobalVector(dm, &X));
diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index a5acffd52e..3809964e9a 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -216,6 +216,7 @@ int main(int argc, char **argv) {
     }
   }
   PetscCall(DMSetVecType(dm_mesh, vec_type));
+  PetscCall(DMSetFromOptions(dm_mesh));
 
   // Create vectors
   PetscCall(DMCreateGlobalVector(dm_mesh, &X));

From f174933e42b4d5a2db574b02cd498c160daf12d9 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Wed, 12 Feb 2025 09:54:08 -0800
Subject: [PATCH 285/571] Remove massive VLA

---
 examples/petsc/src/swarmutils.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/petsc/src/swarmutils.c b/examples/petsc/src/swarmutils.c
index ee047b724f..64901f8ed9 100644
--- a/examples/petsc/src/swarmutils.c
+++ b/examples/petsc/src/swarmutils.c
@@ -617,6 +617,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat
   // Swarm objects
   {
     const PetscInt *cell_points;
+    CeedInt        *offsets;
     IS              is_points;
     Vec             X_ref;
     CeedInt         num_elem;
@@ -628,7 +629,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat
 
     PetscCall(ISGetIndices(is_points, &cell_points));
     PetscInt num_points = cell_points[num_elem + 1] - num_elem - 2;
-    CeedInt  offsets[num_elem + 1 + num_points];
+    PetscCall(PetscCalloc1(num_elem + 1 + num_points, &offsets));
 
     for (PetscInt i = 0; i < num_elem + 1; i++) offsets[i] = cell_points[i + 1] - 1;
     for (PetscInt i = num_elem + 1; i < num_points + num_elem + 1; i++) offsets[i] = cell_points[i + 1];
@@ -685,6 +686,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat
 
     // Cleanup
     PetscCall(ISDestroy(&is_points));
+    PetscCall(PetscFree(offsets));
     PetscCall(VecDestroy(&X_ref));
   }
 

From 124cc107811199b024577cdea9f0240bb43be223 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 12 Feb 2025 07:18:47 -0700
Subject: [PATCH 286/571] vec - use memset on GPU when SetValue for 0

---
 backends/cuda-ref/ceed-cuda-ref-vector.c | 9 ++++++---
 backends/hip-ref/ceed-hip-ref-vector.c   | 9 ++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 7365327229..791673b25f 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -304,10 +304,13 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
     }
   }
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
+    if (val == 0) {
+      CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemset(impl->d_array, 0, length * sizeof(CeedScalar)));
+    } else {
+      CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
+    }
     impl->h_array = NULL;
-  }
-  if (impl->h_array) {
+  } else if (impl->h_array) {
     CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val));
     impl->d_array = NULL;
   }
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 0a3a3fe3d0..5eced676df 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -304,10 +304,13 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
     }
   }
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val));
+    if (val == 0) {
+      CeedCallHip(CeedVectorReturnCeed(vec), hipMemset(impl->d_array, 0, length * sizeof(CeedScalar)));
+    } else {
+      CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val));
+    }
     impl->h_array = NULL;
-  }
-  if (impl->h_array) {
+  } else if (impl->h_array) {
     CeedCallBackend(CeedHostSetValue_Hip(impl->h_array, length, val));
     impl->d_array = NULL;
   }

From c9d5affad74485f8d1e55e6be07e3d9f76bd4cae Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 12 Feb 2025 07:43:33 -0700
Subject: [PATCH 287/571] gpu - minor consistency

---
 backends/cuda-ref/kernels/cuda-ref-vector.cu  | 58 +++++++++++--------
 .../hip-ref/kernels/hip-ref-vector.hip.cpp    | 50 ++++++++--------
 2 files changed, 58 insertions(+), 50 deletions(-)

diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 29788bf4fd..3ce095cb8f 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -12,9 +12,11 @@
 // Kernel for copy strided on device
 //------------------------------------------------------------------------------
 __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  if ((index - start) % step == 0) vec_copy[index] = vec[index];
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) {
+    if ((index - start) % step == 0) vec_copy[index] = vec[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -34,9 +36,9 @@ extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, C
 // Kernel for set value on device
 //------------------------------------------------------------------------------
 __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  vec[index] = val;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) vec[index] = val;
 }
 
 //------------------------------------------------------------------------------
@@ -56,9 +58,11 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, Cee
 // Kernel for set value strided on device
 //------------------------------------------------------------------------------
 __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  if ((index - start) % step == 0) vec[index] = val;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) {
+    if ((index - start) % step == 0) vec[index] = val;
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -78,9 +82,11 @@ extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize star
 // Kernel for taking reciprocal
 //------------------------------------------------------------------------------
 __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) {
+    if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -100,9 +106,9 @@ extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
 // Kernel for scale
 //------------------------------------------------------------------------------
 __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  x[index] *= alpha;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) x[index] *= alpha;
 }
 
 //------------------------------------------------------------------------------
@@ -122,9 +128,9 @@ extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedS
 // Kernel for axpy
 //------------------------------------------------------------------------------
 __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  y[index] += alpha * x[index];
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) y[index] += alpha * x[index];
 }
 
 //------------------------------------------------------------------------------
@@ -144,10 +150,12 @@ extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedSc
 // Kernel for axpby
 //------------------------------------------------------------------------------
 __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  y[index] = beta * y[index];
-  y[index] += alpha * x[index];
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) {
+    y[index] = beta * y[index];
+    y[index] += alpha * x[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -167,9 +175,9 @@ extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedS
 // Kernel for pointwise mult
 //------------------------------------------------------------------------------
 __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size) return;
-  w[index] = x[index] * y[index];
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) w[index] = x[index] * y[index];
 }
 
 //------------------------------------------------------------------------------
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index 5375d2e10b..1186548b16 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -12,10 +12,11 @@
 // Kernel for copy strided on device
 //------------------------------------------------------------------------------
 __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  if ((index - start) % step == 0) vec_copy[index] = vec[index];
+  if (index < size) {
+    if ((index - start) % step == 0) vec_copy[index] = vec[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -35,10 +36,9 @@ extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, Ce
 // Kernel for set value on device
 //------------------------------------------------------------------------------
 __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  vec[index] = val;
+  if (index < size) vec[index] = val;
 }
 
 //------------------------------------------------------------------------------
@@ -58,10 +58,11 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed
 // Kernel for set value strided on device
 //------------------------------------------------------------------------------
 __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  if ((index - start) % step == 0) vec[index] = val;
+  if (index < size) {
+    if ((index - start) % step == 0) vec[index] = val;
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -81,10 +82,11 @@ extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start
 // Kernel for taking reciprocal
 //------------------------------------------------------------------------------
 __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
+  if (index < size) {
+    if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -104,10 +106,9 @@ extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) {
 // Kernel for scale
 //------------------------------------------------------------------------------
 __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  x[index] *= alpha;
+  if (index < size) x[index] *= alpha;
 }
 
 //------------------------------------------------------------------------------
@@ -127,10 +128,9 @@ extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSi
 // Kernel for axpy
 //------------------------------------------------------------------------------
 __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  y[index] += alpha * x[index];
+  if (index < size) y[index] += alpha * x[index];
 }
 
 //------------------------------------------------------------------------------
@@ -150,11 +150,12 @@ extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSca
 // Kernel for axpby
 //------------------------------------------------------------------------------
 __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  y[index] = beta * y[index];
-  y[index] += alpha * x[index];
+  if (index < size) {
+    y[index] = beta * y[index];
+    y[index] += alpha * x[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -174,10 +175,9 @@ extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSc
 // Kernel for pointwise mult
 //------------------------------------------------------------------------------
 __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  w[index] = x[index] * y[index];
+  if (index < size) w[index] = x[index] * y[index];
 }
 
 //------------------------------------------------------------------------------

From 18c38aee78e3756c49c9a08180e50b276da31c2d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 12 Feb 2025 14:47:38 -0700
Subject: [PATCH 288/571] minor - make tidy happy about leak

---
 backends/cuda/ceed-cuda-compile.cpp | 8 ++++++--
 backends/hip/ceed-hip-compile.cpp   | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 6c0e07c0b9..382a6557d5 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -163,8 +163,10 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   va_list args;
 
   va_start(args, num_defines);
-  CeedCallBackend(CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args));
+  const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
+
   va_end(args);
+  CeedCallBackend(ierr);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -172,8 +174,10 @@ int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CU
   va_list args;
 
   va_start(args, num_defines);
-  CeedCallBackend(CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args));
+  const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
+
   va_end(args);
+  CeedCallBackend(ierr);
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 51c83cf222..2bf14a91b1 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -159,8 +159,10 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   va_list args;
 
   va_start(args, num_defines);
-  CeedCallBackend(CeedCompileCore_Hip(ceed, source, true, &is_compile_good, module, num_defines, args));
+  const CeedInt ierr = CeedCompileCore_Hip(ceed, source, true, &is_compile_good, module, num_defines, args);
+
   va_end(args);
+  CeedCallBackend(ierr);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -168,8 +170,10 @@ int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hip
   va_list args;
 
   va_start(args, num_defines);
-  CeedCallBackend(CeedCompileCore_Hip(ceed, source, false, is_compile_good, module, num_defines, args));
+  const CeedInt ierr = CeedCompileCore_Hip(ceed, source, false, is_compile_good, module, num_defines, args);
+
   va_end(args);
+  CeedCallBackend(ierr);
   return CEED_ERROR_SUCCESS;
 }
 

From 41126227c0a68abf2240c8dd998e8a0afe6a93f8 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Wed, 12 Feb 2025 14:16:05 -0800
Subject: [PATCH 289/571] Add common profiling outputs to .gitignore

---
 .gitignore | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.gitignore b/.gitignore
index 7e7115b20e..7e0f413323 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,10 @@ libCEED.includes
 *.aux
 *.fdb_latexmk
 *.fls
+
+# profiling files
+*.so
+*.so.*
+*.txt
+*.proto
+*.csv

From db0c7856395feb5fa95ebd3cb4d39cdd22491ca6 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Wed, 12 Feb 2025 14:20:01 -0800
Subject: [PATCH 290/571] clean up duplicates

---
 .gitignore | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7e0f413323..e7b100a069 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ lib/*
 # General
 *.o
 *.so
+*.so.*
 *.d
 *.DIR
 ceed.pc
@@ -93,8 +94,6 @@ libCEED.includes
 *.fls
 
 # profiling files
-*.so
-*.so.*
 *.txt
 *.proto
 *.csv

From e84c3ebc97a9b078c9a14c8bdeb2c2ebf4a01814 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 12 Feb 2025 15:22:32 -0700
Subject: [PATCH 291/571] gpu - prefer cu/hipBlas over handrolls

---
 backends/cuda-ref/ceed-cuda-ref-vector.c | 292 +++++++++++++----------
 backends/hip-ref/ceed-hip-ref-vector.c   | 260 +++++++++++++-------
 2 files changed, 341 insertions(+), 211 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 791673b25f..259d3526ba 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -253,14 +253,30 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
     CeedScalar *copy_array;
 
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
+#if (CUDA_VERSION >= 12000)
+    cublasHandle_t handle;
+    Ceed           ceed;
+
+    CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+    CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallCublas(ceed, cublasScopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#else  /* CEED_SCALAR */
+    CeedCallCublas(ceed, cublasDcopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#endif /* CEED_SCALAR */
+    CeedCallBackend(CeedDestroy(&ceed));
+#else  /* CUDA_VERSION */
     CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, step, length, copy_array));
+#endif /* CUDA_VERSION */
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->h_array = NULL;
   } else if (impl->h_array) {
     CeedScalar *copy_array;
 
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
     CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, step, length, copy_array));
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->d_array = NULL;
   } else {
     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
   }
@@ -459,9 +475,9 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType
 static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) {
   Ceed     ceed;
   CeedSize length;
-#if CUDA_VERSION < 12000
+#if (CUDA_VERSION < 12000)
   CeedSize num_calls;
-#endif
+#endif /* CUDA_VERSION */
   const CeedScalar *d_array;
   CeedVector_Cuda  *impl;
   cublasHandle_t    handle;
@@ -471,142 +487,142 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
 
-#if CUDA_VERSION < 12000
+#if (CUDA_VERSION < 12000)
   // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
   // we need to check if the vector is too long to handle with int32,
   // and if so, divide it into subsections for repeated cuBLAS calls.
   num_calls = length / INT_MAX;
   if (length % INT_MAX > 0) num_calls += 1;
-#endif
+#endif /* CUDA_VERSION */
 
   // Compute norm
   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
   switch (type) {
     case CEED_NORM_1: {
       *norm = 0.0;
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-#if CUDA_VERSION >= 12000  // We have CUDA 12, and can use 64-bit integers
-        CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
-#else
-        float  sub_norm = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
-#endif
-      } else {
-#if CUDA_VERSION >= 12000
-        CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
-#else
-        double  sub_norm = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
-#endif
+#if defined(CEED_SCALAR_IS_FP32)
+#if (CUDA_VERSION >= 12000)  // We have CUDA 12, and can use 64-bit integers
+      CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+#else  /* CUDA_VERSION */
+      float  sub_norm = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        *norm += sub_norm;
       }
+#endif /* CUDA_VERSION */
+#else  /* CEED_SCALAR */
+#if (CUDA_VERSION >= 12000)
+      CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+#else  /* CUDA_VERSION */
+      double  sub_norm = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        *norm += sub_norm;
+      }
+#endif /* CUDA_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_2: {
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-#if CUDA_VERSION >= 12000
-        CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
-#else
-        float  sub_norm = 0.0, norm_sum = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
-#endif
-      } else {
-#if CUDA_VERSION >= 12000
-        CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
-#else
-        double  sub_norm = 0.0, norm_sum = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
-#endif
+#if defined(CEED_SCALAR_IS_FP32)
+#if (CUDA_VERSION >= 12000)
+      CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+#else  /* CUDA_VERSION */
+      float  sub_norm = 0.0, norm_sum = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        norm_sum += sub_norm * sub_norm;
+      }
+      *norm = sqrt(norm_sum);
+#endif /* CUDA_VERSION */
+#else  /* CEED_SCALAR */
+#if (CUDA_VERSION >= 12000)
+      CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+#else  /* CUDA_VERSION */
+      double  sub_norm = 0.0, norm_sum = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        norm_sum += sub_norm * sub_norm;
       }
+      *norm = sqrt(norm_sum);
+#endif /* CUDA_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_MAX: {
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-#if CUDA_VERSION >= 12000
-        int64_t    index;
-        CeedScalar norm_no_abs;
-
-        CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
-        CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-        *norm = fabs(norm_no_abs);
-#else
-        CeedInt index;
-        float   sub_max = 0.0, current_max = 0.0;
-        float  *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
-          CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
-        }
-        *norm = current_max;
-#endif
-      } else {
-#if CUDA_VERSION >= 12000
-        int64_t    index;
-        CeedScalar norm_no_abs;
-
-        CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
-        CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-        *norm = fabs(norm_no_abs);
-#else
-        CeedInt index;
-        double  sub_max = 0.0, current_max = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
-          CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
-        }
-        *norm = current_max;
-#endif
+#if defined(CEED_SCALAR_IS_FP32)
+#if (CUDA_VERSION >= 12000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
+      CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+      *norm = fabs(norm_no_abs);
+#else  /* CUDA_VERSION */
+      CeedInt index;
+      float   sub_max = 0.0, current_max = 0.0;
+      float  *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
+        CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
+      *norm = current_max;
+#endif /* CUDA_VERSION */
+#else  /* CEED_SCALAR */
+#if (CUDA_VERSION >= 12000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
+      CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+      *norm = fabs(norm_no_abs);
+#else  /* CUDA_VERSION */
+      CeedInt index;
+      double  sub_max = 0.0, current_max = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
+        CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
+      }
+      *norm = current_max;
+#endif /* CUDA_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
   }
@@ -663,13 +679,29 @@ int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length)
 //------------------------------------------------------------------------------
 static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
   CeedSize         length;
-  CeedVector_Cuda *x_impl;
+  CeedVector_Cuda *impl;
 
-  CeedCallBackend(CeedVectorGetData(x, &x_impl));
+  CeedCallBackend(CeedVectorGetData(x, &impl));
   CeedCallBackend(CeedVectorGetLength(x, &length));
   // Set value for synced device/host array
-  if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length));
-  if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length));
+  if (impl->d_array) {
+#if (CUDA_VERSION >= 12000)
+    cublasHandle_t handle;
+
+    CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(x), &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallCublas(CeedVectorReturnCeed(x), cublasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallCublas(CeedVectorReturnCeed(x), cublasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#endif /* CEED_SCALAR */
+#else  /* CUDA_VERSION */
+    CeedCallBackend(CeedDeviceScale_Cuda(impl->d_array, alpha, length));
+#endif /* CUDA_VERSION */
+    impl->h_array = NULL;
+  } else if (impl->h_array) {
+    CeedCallBackend(CeedHostScale_Cuda(impl->h_array, alpha, length));
+    impl->d_array = NULL;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -699,11 +731,23 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
   // Set value for synced device/host array
   if (y_impl->d_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
+#if (CUDA_VERSION >= 12000)
+    cublasHandle_t handle;
+
+    CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(y), &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallCublas(CeedVectorReturnCeed(y), cublasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallCublas(CeedVectorReturnCeed(y), cublasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#endif /* CEED_SCALAR */
+#else  /* CUDA_VERSION */
     CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length));
-  }
-  if (y_impl->h_array) {
+#endif /* CUDA_VERSION */
+    y_impl->h_array = NULL;
+  } else if (y_impl->h_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
     CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length));
+    y_impl->d_array = NULL;
   }
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 5eced676df..93798d72e8 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -253,14 +253,30 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
     CeedScalar *copy_array;
 
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
+#if (HIP_VERSION >= 60000000)
+    hipblasHandle_t handle;
+    Ceed            ceed;
+
+    CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+    CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#else  /* CEED_SCALAR */
+    CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#endif /* CEED_SCALAR */
+#else  /* HIP_VERSION */
     CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, step, length, copy_array));
+#endif /* HIP_VERSION */
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->h_array = NULL;
+    CeedCallBackend(CeedDestroy(&ceed));
   } else if (impl->h_array) {
     CeedScalar *copy_array;
 
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
     CeedCallBackend(CeedHostCopyStrided_Hip(impl->h_array, start, step, length, copy_array));
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->d_array = NULL;
   } else {
     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
   }
@@ -461,8 +477,11 @@ static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType m
 // Get the norm of a CeedVector
 //------------------------------------------------------------------------------
 static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *norm) {
-  Ceed              ceed;
-  CeedSize          length, num_calls;
+  Ceed     ceed;
+  CeedSize length;
+#if (HIP_VERSION < 60000000)
+  CeedSize num_calls;
+#endif /* HIP_VERSION */
   const CeedScalar *d_array;
   CeedVector_Hip   *impl;
   hipblasHandle_t   handle;
@@ -472,104 +491,142 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
 
-  // Is the vector too long to handle with int32? If so, we will divide
-  // it up into "int32-sized" subsections and make repeated BLAS calls.
+#if (HIP_VERSION < 60000000)
+  // With ROCm 6, we can use the 64-bit integer interface. Prior to that,
+  // we need to check if the vector is too long to handle with int32,
+  // and if so, divide it into subsections for repeated hipBLAS calls.
   num_calls = length / INT_MAX;
   if (length % INT_MAX > 0) num_calls += 1;
+#endif /* HIP_VERSION */
 
   // Compute norm
   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
   switch (type) {
     case CEED_NORM_1: {
       *norm = 0.0;
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-        float  sub_norm = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
-      } else {
-        double  sub_norm = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
+#if defined(CEED_SCALAR_IS_FP32)
+#if (HIP_VERSION >= 60000000)  // We have ROCm 6, and can use 64-bit integers
+      CeedCallHipblas(ceed, hipblasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+#else  /* HIP_VERSION */
+      float  sub_norm = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        *norm += sub_norm;
+      }
+#endif /* HIP_VERSION */
+#else  /* CEED_SCALAR */
+#if (HIP_VERSION >= 60000000)
+      CeedCallHipblas(ceed, hipblasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+#else  /* HIP_VERSION */
+      double  sub_norm = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        *norm += sub_norm;
       }
+#endif /* HIP_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_2: {
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-        float  sub_norm = 0.0, norm_sum = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
-      } else {
-        double  sub_norm = 0.0, norm_sum = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
+#if defined(CEED_SCALAR_IS_FP32)
+#if (HIP_VERSION >= 60000000)
+      CeedCallHipblas(ceed, hipblasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+#else  /* CUDA_VERSION */
+      float  sub_norm = 0.0, norm_sum = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        norm_sum += sub_norm * sub_norm;
       }
+      *norm = sqrt(norm_sum);
+#endif /* HIP_VERSION */
+#else  /* CEED_SCALAR */
+#if (HIP_VERSION >= 60000000)
+      CeedCallHipblas(ceed, hipblasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+#else  /* CUDA_VERSION */
+      double  sub_norm = 0.0, norm_sum = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        norm_sum += sub_norm * sub_norm;
+      }
+      *norm = sqrt(norm_sum);
+#endif /* HIP_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_MAX: {
+#if defined(CEED_SCALAR_IS_FP32)
+#if (HIP_VERSION >= 60000000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallHipblas(ceed, hipblasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
+      CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+      *norm = fabs(norm_no_abs);
+#else  /* HIP_VERSION */
       CeedInt index;
+      float   sub_max = 0.0, current_max = 0.0;
+      float  *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
+        CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
+      }
+      *norm = current_max;
+#endif /* HIP_VERSION */
+#else  /* CEED_SCALAR */
+#if (HIP_VERSION >= 60000000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallHipblas(ceed, hipblasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
+      CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+      *norm = fabs(norm_no_abs);
+#else  /* HIP_VERSION */
+      CeedInt index;
+      double  sub_max = 0.0, current_max = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-        float  sub_max = 0.0, current_max = 0.0;
-        float *d_array_start;
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
-          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
-        }
-        *norm = current_max;
-      } else {
-        double  sub_max = 0.0, current_max = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
-          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
-        }
-        *norm = current_max;
+        CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
+        CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
+      *norm = current_max;
+#endif /* HIP_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
   }
@@ -626,13 +683,30 @@ int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
 //------------------------------------------------------------------------------
 static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) {
   CeedSize        length;
-  CeedVector_Hip *x_impl;
+  CeedVector_Hip *impl;
 
-  CeedCallBackend(CeedVectorGetData(x, &x_impl));
+  CeedCallBackend(CeedVectorGetData(x, &impl));
   CeedCallBackend(CeedVectorGetLength(x, &length));
   // Set value for synced device/host array
-  if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Hip(x_impl->d_array, alpha, length));
-  if (x_impl->h_array) CeedCallBackend(CeedHostScale_Hip(x_impl->h_array, alpha, length));
+  if (impl->d_array) {
+#if (HIP_VERSION >= 60000000)
+    hipblasHandle_t handle;
+
+    CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallHipblas(CeedVectorReturnCeed(x), hipblasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallHipblas(CeedVectorReturnCeed(x), hipblasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#endif /* CEED_SCALAR */
+#else  /* HIP_VERSION */
+    CeedCallBackend(CeedDeviceScale_Hip(impl->d_array, alpha, length));
+#endif /* HIP_VERSION */
+    impl->h_array = NULL;
+  }
+  if (impl->h_array) {
+    CeedCallBackend(CeedHostScale_Hip(impl->h_array, alpha, length));
+    impl->d_array = NULL;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -662,11 +736,23 @@ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) {
   // Set value for synced device/host array
   if (y_impl->d_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
+#if (HIP_VERSION >= 60000000)
+    hipblasHandle_t handle;
+
+    CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(y), &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallHipblas(CeedVectorReturnCeed(y), hipblasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallHipblas(CeedVectorReturnCeed(y), hipblasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#endif /* CEED_SCALAR */
+#else  /* HIP_VERSION */
     CeedCallBackend(CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length));
-  }
-  if (y_impl->h_array) {
+#endif /* HIP_VERSION */
+    y_impl->h_array = NULL;
+  } else if (y_impl->h_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
     CeedCallBackend(CeedHostAXPY_Hip(y_impl->h_array, alpha, x_impl->h_array, length));
+    y_impl->d_array = NULL;
   }
   return CEED_ERROR_SUCCESS;
 }

From af0e6e89dc9eb1085789d576366358a2a9a69ecc Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 13 Feb 2025 10:51:03 -0700
Subject: [PATCH 292/571] gpu - add Transpose/TransposeAdd variants for
 AtPoints

---
 backends/cuda-shared/ceed-cuda-shared-basis.c |  54 +++++--
 backends/cuda-shared/ceed-cuda-shared.h       |   2 +
 backends/hip-shared/ceed-hip-shared-basis.c   |  52 +++++--
 backends/hip-shared/ceed-hip-shared.h         |   2 +
 .../cuda/cuda-shared-basis-tensor-at-points.h | 143 ++++++++++++++++++
 .../hip/hip-shared-basis-tensor-at-points.h   | 143 ++++++++++++++++++
 6 files changed, 368 insertions(+), 28 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index de63662ded..eb2a976625 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -292,8 +292,10 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
                                      "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints));
   }
 
   // Get read/write access to u, v
@@ -303,8 +305,6 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   if (apply_add) {
     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
   } else {
-    // Clear v for transpose operation
-    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
@@ -325,8 +325,12 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1,
-                                                    elems_per_block, shared_mem, interp_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+                                                      thread_1d, 1, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+        }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
@@ -334,15 +338,25 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
-                                                    thread_1d, elems_per_block, shared_mem, interp_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(
+              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
-                                                    thread_1d, elems_per_block, shared_mem, interp_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(
+              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        }
       }
     } break;
     case CEED_EVAL_GRAD: {
@@ -360,8 +374,12 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1,
-                                                    elems_per_block, shared_mem, grad_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
+                                                      1, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+        }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
@@ -369,15 +387,23 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
-                                                    elems_per_block, shared_mem, grad_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
-                                                    elems_per_block, shared_mem, grad_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        }
       }
     } break;
     case CEED_EVAL_WEIGHT:
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index db2d47809d..754028b964 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -23,8 +23,10 @@ typedef struct {
   CeedInt     num_points;
   CUfunction  InterpAtPoints;
   CUfunction  InterpTransposeAtPoints;
+  CUfunction  InterpTransposeAddAtPoints;
   CUfunction  GradAtPoints;
   CUfunction  GradTransposeAtPoints;
+  CUfunction  GradTransposeAddAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_collo_grad_1d;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index ca627d9061..8bd33917dc 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -371,8 +371,6 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   if (apply_add) {
     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
   } else {
-    // Clear v for transpose operation
-    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
@@ -393,23 +391,37 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1,
-                                                   elems_per_block, shared_mem, interp_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+                                                     thread_1d, 1, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+        }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
-                                                   thread_1d, elems_per_block, shared_mem, interp_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(
+              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        }
       } else if (dim == 3) {
         const CeedInt elems_per_block = 1;
         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
-                                                   thread_1d, elems_per_block, shared_mem, interp_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(
+              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        }
       }
     } break;
     case CEED_EVAL_GRAD: {
@@ -427,23 +439,35 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1,
-                                                   elems_per_block, shared_mem, grad_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
+                                                     1, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+        }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
-                                                   elems_per_block, shared_mem, grad_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
+                                                     thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        }
       } else if (dim == 3) {
         const CeedInt elems_per_block = 1;
         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
-        CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
-                                                   elems_per_block, shared_mem, grad_args));
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
+                                                     thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        }
       }
     } break;
     case CEED_EVAL_WEIGHT:
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index 236e6f63a0..cfb9480f49 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -23,8 +23,10 @@ typedef struct {
   CeedInt       num_points;
   hipFunction_t InterpAtPoints;
   hipFunction_t InterpTransposeAtPoints;
+  hipFunction_t InterpTransposeAddAtPoints;
   hipFunction_t GradAtPoints;
   hipFunction_t GradTransposeAtPoints;
+  hipFunction_t GradTransposeAddAtPoints;
   CeedInt       block_sizes[3];  // interp, grad, weight thread block sizes
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index d0cc602be9..d24106863f 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -97,6 +97,77 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
   __syncthreads();
 
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                      const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -215,6 +286,78 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
   __syncthreads();
 
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 753d5e1af7..5de645c501 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -98,6 +98,77 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
   __syncthreads();
 
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -217,6 +288,78 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
   __syncthreads();
 
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+    __syncthreads();
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                                  const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register

From 3719258a628b8e1dc5c85f3eda31f5c9774d6f96 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Thu, 13 Feb 2025 11:22:47 -0800
Subject: [PATCH 293/571] Minor bpswarm fixes

---
 examples/petsc/bpsswarm.c | 50 +++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 3809964e9a..33248910e7 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -100,7 +100,7 @@ int main(int argc, char **argv) {
                              &write_true_solution_swarm, NULL));
   degree = 2;
   PetscCall(PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", NULL, degree, &degree, NULL));
-  q_extra = bp_options[bp_choice].q_extra;
+  q_extra = 0;
   PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, q_extra, &q_extra, NULL));
   PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, ceed_resource, ceed_resource, sizeof(ceed_resource), NULL));
   PetscCall(PetscGetHostName(hostname, sizeof hostname));
@@ -146,6 +146,27 @@ int main(int argc, char **argv) {
   }
   PetscOptionsEnd();
 
+  // Set up libCEED
+  CeedInit(ceed_resource, &ceed);
+  CeedMemType mem_type_backend;
+  CeedGetPreferredMemType(ceed, &mem_type_backend);
+
+  // Set background mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
+    }
+  }
+
   // Setup DM
   if (read_mesh) {
     PetscCall(DMPlexCreateFromFile(comm, filename, NULL, PETSC_TRUE, &dm_mesh));
@@ -162,11 +183,13 @@ int main(int argc, char **argv) {
       PetscCheck(!is_simplex, comm, PETSC_ERR_USER, "Only tensor-product background meshes supported");
     }
   }
+  PetscCall(DMSetVecType(dm_mesh, vec_type));
+  PetscCall(DMSetFromOptions(dm_mesh));
+
   PetscCall(DMGetDimension(dm_mesh, &dim));
   PetscCall(SetupDMByDegree(dm_mesh, degree, q_extra, num_comp_u, dim, bp_options[bp_choice].enforce_bc));
 
   // View mesh
-  PetscCall(DMSetOptionsPrefix(dm_mesh, "final_"));
   PetscCall(DMViewFromOptions(dm_mesh, NULL, "-dm_view"));
 
   // Create particle swarm
@@ -195,29 +218,6 @@ int main(int argc, char **argv) {
   PetscCall(PetscObjectSetName((PetscObject)dm_swarm, "Particle Swarm"));
   PetscCall(DMViewFromOptions(dm_swarm, NULL, "-dm_swarm_view"));
 
-  // Set up libCEED
-  CeedInit(ceed_resource, &ceed);
-  CeedMemType mem_type_backend;
-  CeedGetPreferredMemType(ceed, &mem_type_backend);
-
-  // Set background mesh vec_type
-  switch (mem_type_backend) {
-    case CEED_MEM_HOST:
-      vec_type = VECSTANDARD;
-      break;
-    case CEED_MEM_DEVICE: {
-      const char *resolved;
-
-      CeedGetResource(ceed, &resolved);
-      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-      else vec_type = VECSTANDARD;
-    }
-  }
-  PetscCall(DMSetVecType(dm_mesh, vec_type));
-  PetscCall(DMSetFromOptions(dm_mesh));
-
   // Create vectors
   PetscCall(DMCreateGlobalVector(dm_mesh, &X));
   PetscCall(VecGetLocalSize(X, &l_size));

From 3f49564b8bf582357b06cc8cc0c2b2bd0573ff59 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Thu, 13 Feb 2025 11:27:07 -0800
Subject: [PATCH 294/571] undo q_extra change

---
 examples/petsc/bpsswarm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 33248910e7..3ff9685c8e 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -100,7 +100,7 @@ int main(int argc, char **argv) {
                              &write_true_solution_swarm, NULL));
   degree = 2;
   PetscCall(PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", NULL, degree, &degree, NULL));
-  q_extra = 0;
+  q_extra = bp_options[bp_choice].q_extra;
   PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, q_extra, &q_extra, NULL));
   PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, ceed_resource, ceed_resource, sizeof(ceed_resource), NULL));
   PetscCall(PetscGetHostName(hostname, sizeof hostname));

From a8772291e77ef4a118363067e2a34dce366a0bfb Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 13 Feb 2025 16:28:35 -0700
Subject: [PATCH 295/571] hip - fix bug, need to actually get kernels

---
 backends/hip-shared/ceed-hip-shared-basis.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 8bd33917dc..4dc823754c 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -360,8 +360,10 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
                                     data->block_sizes[0]));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints));
   }
 
   // Get read/write access to u, v

From 4422a61cbad3667d93498ebd26401008bb17cfcc Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 17 Feb 2025 07:49:05 -0700
Subject: [PATCH 296/571] petsc - determin vec type before DMSetFromOptions

---
 examples/petsc/bpssphere.c | 44 +++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c
index ab9648a4de..043465b789 100644
--- a/examples/petsc/bpssphere.c
+++ b/examples/petsc/bpssphere.c
@@ -92,6 +92,27 @@ int main(int argc, char **argv) {
   PetscCall(PetscOptionsBool("-simplex", "Use simplices, or tensor product cells", NULL, simplex, &simplex, NULL));
   PetscOptionsEnd();
 
+  // Set up libCEED
+  CeedInit(ceed_resource, &ceed);
+  CeedMemType mem_type_backend;
+  CeedGetPreferredMemType(ceed, &mem_type_backend);
+
+  // Set mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
+    }
+  }
+
   // Setup DM
   if (read_mesh) {
     PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm));
@@ -104,6 +125,7 @@ int main(int argc, char **argv) {
     // Refine DMPlex with uniform refinement using runtime option -dm_refine
     PetscCall(DMPlexSetRefinementUniform(dm, PETSC_TRUE));
   }
+  PetscCall(DMSetVecType(dm, vec_type));
   PetscCall(DMSetFromOptions(dm));
   // View DMPlex via runtime option
   PetscCall(DMViewFromOptions(dm, NULL, "-dm_view"));
@@ -125,28 +147,6 @@ int main(int argc, char **argv) {
   PetscCall(MatCreateShell(comm, l_size, l_size, g_size, g_size, op_apply_ctx, &mat_O));
   PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed));
 
-  // Set up libCEED
-  CeedInit(ceed_resource, &ceed);
-  CeedMemType mem_type_backend;
-  CeedGetPreferredMemType(ceed, &mem_type_backend);
-
-  // Set mesh vec_type
-  switch (mem_type_backend) {
-    case CEED_MEM_HOST:
-      vec_type = VECSTANDARD;
-      break;
-    case CEED_MEM_DEVICE: {
-      const char *resolved;
-
-      CeedGetResource(ceed, &resolved);
-      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-      else vec_type = VECSTANDARD;
-    }
-  }
-  PetscCall(DMSetVecType(dm, vec_type));
-
   // Print summary
   if (!test_mode) {
     PetscInt    P = degree + 1, Q = P + q_extra;

From 25dc7e9f521aebf9309cdc04bbc1cb3fd7dfb61b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Feb 2025 12:21:35 -0700
Subject: [PATCH 297/571] ci - remove IMB test due to action bug

---
 .github/workflows/c-fortran-test-ppc64le.yml | 40 --------------------
 1 file changed, 40 deletions(-)
 delete mode 100644 .github/workflows/c-fortran-test-ppc64le.yml

diff --git a/.github/workflows/c-fortran-test-ppc64le.yml b/.github/workflows/c-fortran-test-ppc64le.yml
deleted file mode 100644
index 709cdcaa84..0000000000
--- a/.github/workflows/c-fortran-test-ppc64le.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: IBM Power
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-
-jobs:
-  test:
-    strategy:
-      matrix:
-        os: [ubuntu-24.04]
-        compiler: [gcc-13]
-        arch: [ppc64le]
-        distro: [ubuntu22.04]
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-    - name: Environment setup
-      uses: actions/checkout@v4
-    - name: Hardware setup and test libCEED
-      uses: uraimo/run-on-arch-action@v2
-      env:
-        CC: ${{ matrix.compiler }}
-        FC: gfortran-13
-      id: runcmd
-      with:
-        arch: ${{ matrix.arch }}
-        distro: ${{ matrix.distro }}
-        run: |
-          apt-get -y update
-          apt-get install -y build-essential
-          apt-get install -y gfortran
-          apt-get install -y python3
-          uname -a
-          make info
-          make -j2
-          PROVE_OPTS=-v make prove -j2

From ea04d07fbab2141e7d4cd58a032ed98e4ff61ed0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Feb 2025 14:24:19 -0700
Subject: [PATCH 298/571] gpu - isolate gen ApplyAdd inner logic

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 93 ++++++++-------------
 backends/hip-gen/ceed-hip-gen-operator.c   | 97 ++++++++--------------
 2 files changed, 70 insertions(+), 120 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 3410cdcfb7..77ef6e9535 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -98,33 +98,22 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 //------------------------------------------------------------------------------
 // Apply and add to output
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                    is_at_points, is_tensor, is_run_good = true;
+static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
+                                             CeedRequest *request) {
+  bool                    is_at_points, is_tensor;
   Ceed                    ceed;
   Ceed_Cuda              *cuda_data;
   CeedInt                 num_elem, num_input_fields, num_output_fields;
   CeedEvalMode            eval_mode;
-  CeedVector              output_vecs[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
   CeedQFunction_Cuda_gen *qf_data;
   CeedQFunction           qf;
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Cuda_gen  *data;
 
-  // Creation of the operator
-  {
-    bool is_good_build = false;
-
-    CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_good_build));
-    if (!is_good_build) {
-      CeedOperator op_fallback;
-
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to code generation issue");
-      CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
-      return CEED_ERROR_SUCCESS;
-    }
-  }
+  // Build the operator kernel
+  CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, is_run_good));
+  if (!(*is_run_good)) return CEED_ERROR_SUCCESS;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &cuda_data));
@@ -147,9 +136,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = input_vec;
-      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) data->fields.inputs[i] = input_arr;
+      else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -165,23 +154,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = output_vec;
-      output_vecs[i] = vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
-        }
-      }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
-      } else {
-        data->fields.outputs[i] = data->fields.outputs[index];
-      }
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) data->fields.outputs[i] = output_arr;
+      else CeedCallBackend(CeedVectorGetArrayWrite(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -244,7 +219,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs));
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -256,9 +231,8 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = input_vec;
-      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -272,20 +246,8 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = output_vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
-        }
-      }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
-      }
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -304,16 +266,29 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   // Cleanup
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
+  if (!(*is_run_good)) data->use_fallback = true;
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Fallback if run was bad (out of resources)
+static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool              is_run_good = false;
+  const CeedScalar *input_arr   = NULL;
+  CeedScalar       *output_arr  = NULL;
+
+  // Try to run kernel
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, input_arr, output_arr, &is_run_good, request));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+
+  // Fallback on unsuccessful run
   if (!is_run_good) {
     CeedOperator op_fallback;
 
-    data->use_fallback = true;
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to kernel execution issue");
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
-    return CEED_ERROR_SUCCESS;
   }
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index c081c98cc6..b10dc89d44 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -34,12 +34,12 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Apply and add to output
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                   is_at_points, is_tensor, is_good_run = true;
+static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
+                                            CeedRequest *request) {
+  bool                   is_at_points, is_tensor;
   Ceed                   ceed;
   CeedInt                num_elem, num_input_fields, num_output_fields;
   CeedEvalMode           eval_mode;
-  CeedVector             output_vecs[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
   CeedQFunction_Hip_gen *qf_data;
   CeedQFunction          qf;
@@ -47,19 +47,8 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedOperator_Hip_gen  *data;
 
   // Creation of the operator
-  {
-    bool is_good_build = false;
-
-    CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_good_build));
-    if (!is_good_build) {
-      CeedOperator op_fallback;
-
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to code generation issue");
-      CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
-      return CEED_ERROR_SUCCESS;
-    }
-  }
+  CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, is_run_good));
+  if (!(*is_run_good)) return CEED_ERROR_SUCCESS;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &data));
@@ -81,9 +70,9 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = input_vec;
-      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) data->fields.inputs[i] = input_arr;
+      else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -99,23 +88,9 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = output_vec;
-      output_vecs[i] = vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
-        }
-      }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
-      } else {
-        data->fields.outputs[i] = data->fields.outputs[index];
-      }
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) data->fields.outputs[i] = output_arr;
+      else CeedCallBackend(CeedVectorGetArrayWrite(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -178,19 +153,19 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   } else if (dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   } else if (dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs));
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   }
 
   // Restore input arrays
@@ -203,9 +178,8 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = input_vec;
-      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -219,20 +193,8 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
-      if (is_active) vec = output_vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
-        }
-      }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
-      }
-      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -251,16 +213,29 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   // Cleanup
   CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
+  if (!(*is_run_good)) data->use_fallback = true;
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Fallback if run was bad (out of resources)
-  if (!is_good_run) {
+static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool              is_run_good = false;
+  const CeedScalar *input_arr   = NULL;
+  CeedScalar       *output_arr  = NULL;
+
+  // Try to run kernel
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, input_arr, output_arr, &is_run_good, request));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+
+  // Fallback on unsuccessful run
+  if (!is_run_good) {
     CeedOperator op_fallback;
 
-    data->use_fallback = true;
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to kernel execution issue");
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
-    return CEED_ERROR_SUCCESS;
   }
   return CEED_ERROR_SUCCESS;
 }

From e9c76bddc0f2a44f522e0176ed6b7e0c0aa1df73 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Feb 2025 13:24:22 -0700
Subject: [PATCH 299/571] gpu - allow running shared kernels on stream

---
 backends/cuda-gen/ceed-cuda-gen-operator.c    |  6 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c    |  4 +-
 backends/cuda-shared/ceed-cuda-shared-basis.c | 75 ++++++++++---------
 backends/cuda/ceed-cuda-compile.cpp           | 24 +++---
 backends/cuda/ceed-cuda-compile.h             |  8 +-
 backends/hip-gen/ceed-hip-gen-operator.c      | 12 +--
 backends/hip-ref/ceed-hip-ref-operator.c      |  4 +-
 backends/hip-shared/ceed-hip-shared-basis.c   | 70 ++++++++---------
 backends/hip/ceed-hip-compile.cpp             | 23 +++---
 backends/hip/ceed-hip-compile.h               |  8 +-
 backends/magma/ceed-magma-basis.c             | 18 ++---
 11 files changed, 130 insertions(+), 122 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 77ef6e9535..c80f08a3d2 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -98,7 +98,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 //------------------------------------------------------------------------------
 // Apply and add to output
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
+static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
                                              CeedRequest *request) {
   bool                    is_at_points, is_tensor;
   Ceed                    ceed;
@@ -219,7 +219,7 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, const CeedScalar *
   }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs));
+  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs));
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -278,7 +278,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   // Try to run kernel
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
-  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, input_arr, output_arr, &is_run_good, request));
+  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 319d10145f..bc0414fb2a 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1755,8 +1755,8 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
   void   *args[] = {(void *)&num_elem_in, &asmb->d_B_in,     &asmb->d_B_out,      &orients_in,  &curl_orients_in,
                     &orients_out,         &curl_orients_out, &assembled_qf_array, &values_array};
 
-  CeedCallBackend(
-      CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args));
+  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, NULL, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block,
+                                              shared_mem, args));
 
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index eb2a976625..993e877f32 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -64,10 +64,10 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1,
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1,
                                                       elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
@@ -77,10 +77,11 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
-                                                      elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(
+              CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
@@ -88,10 +89,11 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
-                                                      elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(
+              CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       }
     } break;
@@ -116,10 +118,10 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1,
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1,
                                                       elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
@@ -129,10 +131,10 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
-                                                      elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
@@ -140,10 +142,10 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
-                                                      elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
@@ -326,10 +328,11 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                       thread_1d, 1, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(
+              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
@@ -339,11 +342,11 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                       thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
@@ -351,11 +354,11 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                       thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       }
     } break;
@@ -375,10 +378,10 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
-                                                      1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                      thread_1d, 1, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
@@ -388,10 +391,11 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
-                                                      thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(
+              CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
@@ -399,10 +403,11 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
-                                                      thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(
+              CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
@@ -482,10 +487,10 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
         CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1,
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1,
                                                       elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args));
         }
       }
     } break;
@@ -506,10 +511,10 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
         CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1,
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1,
                                                       elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 382a6557d5..d11cb3b499 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -206,7 +206,7 @@ int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void
 // Run CUDA kernel
 //------------------------------------------------------------------------------
 int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
-  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size, 1, 1, 0, args));
+  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -215,20 +215,20 @@ int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const
 //------------------------------------------------------------------------------
 int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
                           void **args) {
-  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
+  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Run CUDA kernel for spatial dimension with shared memory
 //------------------------------------------------------------------------------
-static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
-                                           const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run,
-                                           void **args) {
+static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
+                                           const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
+                                           bool *is_good_run, void **args) {
 #if CUDA_VERSION >= 9000
   cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
 #endif
-  CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
+  CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
 
   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
     *is_good_run = false;
@@ -246,19 +246,19 @@ static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const i
   return CEED_ERROR_SUCCESS;
 }
 
-int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
+int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
                                 const int block_size_z, const int shared_mem_size, void **args) {
   bool is_good_run = true;
 
-  CeedCallBackend(
-      CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
+  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
+                                                  &is_good_run, args));
   return CEED_ERROR_SUCCESS;
 }
 
-int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
+int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
                                    const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
-  CeedCallBackend(
-      CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
+  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
+                                                  is_good_run, args));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h
index 21204a495d..07572aa4ad 100644
--- a/backends/cuda/ceed-cuda-compile.h
+++ b/backends/cuda/ceed-cuda-compile.h
@@ -23,7 +23,7 @@ CEED_INTERN int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t
 
 CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, void **args);
 
-CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
-                                            int shared_mem_size, void **args);
-CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
-                                               int shared_mem_size, bool *is_good_run, void **args);
+CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y,
+                                            int block_size_z, int shared_mem_size, void **args);
+CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y,
+                                               int block_size_z, int shared_mem_size, bool *is_good_run, void **args);
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index b10dc89d44..2041c872df 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -34,8 +34,8 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Apply and add to output
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
-                                            CeedRequest *request) {
+static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream, const CeedScalar *input_arr, CeedScalar *output_arr,
+                                            bool *is_run_good, CeedRequest *request) {
   bool                   is_at_points, is_tensor;
   Ceed                   ceed;
   CeedInt                num_elem, num_input_fields, num_output_fields;
@@ -153,19 +153,19 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, const CeedScalar *i
     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   } else if (dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   } else if (dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
+        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   }
 
   // Restore input arrays
@@ -225,7 +225,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   // Try to run kernel
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
-  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, input_arr, output_arr, &is_run_good, request));
+  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 67a3533ee2..e20d6d13af 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1752,8 +1752,8 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
   void   *args[] = {(void *)&num_elem_in, &asmb->d_B_in,     &asmb->d_B_out,      &orients_in,  &curl_orients_in,
                     &orients_out,         &curl_orients_out, &assembled_qf_array, &values_array};
 
-  CeedCallBackend(
-      CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args));
+  CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, NULL, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block,
+                                             shared_mem, args));
 
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 4dc823754c..f0080d1eda 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -131,10 +131,10 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1,
                                                      elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
@@ -143,10 +143,10 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
-                                                     elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                     thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
@@ -154,10 +154,10 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
-                                                     elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                     thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       }
     } break;
@@ -183,10 +183,10 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1,
                                                      elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
@@ -195,10 +195,10 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d,
                                                      elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
@@ -206,10 +206,10 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d,
                                                      elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
@@ -394,10 +394,10 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                      thread_1d, 1, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
@@ -406,11 +406,11 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = 1;
@@ -418,11 +418,11 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, grid,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
           CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       }
     } break;
@@ -442,10 +442,10 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
-                                                     1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                     thread_1d, 1, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
@@ -454,10 +454,11 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
-                                                     thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(
+              CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = 1;
@@ -465,10 +466,11 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (is_transpose) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, grid, thread_1d,
-                                                     thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(
+              CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
@@ -547,10 +549,10 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1,
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1,
                                                      elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args));
         }
       }
     } break;
@@ -570,10 +572,10 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
         CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, elems_per_block,
-                                                     shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1,
+                                                     elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 2bf14a91b1..52dd6848c3 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -205,28 +205,29 @@ int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, c
 //------------------------------------------------------------------------------
 // Run HIP kernel for spatial dimension with shared memory
 //------------------------------------------------------------------------------
-static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
-                                          const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, void **args) {
-  hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
+static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
+                                          const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
+                                          bool *is_good_run, void **args) {
+  hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
 
   *is_good_run = result == hipSuccess;
   if (throw_error) CeedCallHip(ceed, result);
   return CEED_ERROR_SUCCESS;
 }
 
-int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
-                               const int block_size_z, const int shared_mem_size, void **args) {
+int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
+                               const int block_size_y, const int block_size_z, const int shared_mem_size, void **args) {
   bool is_good_run = true;
 
-  CeedCallBackend(
-      CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
+  CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
+                                                 &is_good_run, args));
   return CEED_ERROR_SUCCESS;
 }
 
-int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
-                                  const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
-  CeedCallBackend(
-      CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
+int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
+                                  const int block_size_y, const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
+  CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
+                                                 is_good_run, args));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h
index 0a29fad33e..66f542e1a1 100644
--- a/backends/hip/ceed-hip-compile.h
+++ b/backends/hip/ceed-hip-compile.h
@@ -22,7 +22,7 @@ CEED_INTERN int CeedRunKernel_Hip(Ceed ceed, hipFunction_t kernel, int grid_size
 CEED_INTERN int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
                                      void **args);
 
-CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
-                                           int shared_mem_size, void **args);
-CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
-                                              int shared_mem_size, bool *is_good_run, void **args);
+CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y,
+                                           int block_size_z, int shared_mem_size, void **args);
+CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y,
+                                              int block_size_z, int shared_mem_size, bool *is_good_run, void **args);
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index e1f39139d6..3043576489 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -117,10 +117,10 @@ static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num
       void   *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem};
 
       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->InterpTransposeAdd : impl->InterpTranspose, grid, num_threads, num_t_col,
-                                                    1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->InterpTransposeAdd : impl->InterpTranspose, NULL, grid, num_threads,
+                                                    num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_GRAD: {
@@ -195,10 +195,10 @@ static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num
                         &v_elem_stride,     &v_comp_stride,   &v_dim_stride, &num_elem};
 
       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, grid, num_threads, num_t_col, 1,
-                                                    shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, NULL, grid, num_threads,
+                                                    num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_WEIGHT: {
@@ -230,7 +230,7 @@ static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num
       CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
       void   *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem};
 
-      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args));
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
     } break;
     // LCOV_EXCL_START
     case CEED_EVAL_DIV:
@@ -429,7 +429,7 @@ static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, Ce
       CeedInt shared_mem   = (t_mode != CEED_TRANSPOSE && q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
       void   *args[]       = {&N, &d_b, &d_u, &d_v};
 
-      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args));
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, NULL, grid, M, num_t_col, 1, shared_mem, args));
     } else {
       for (CeedInt d = 0; d < q_comp; d++) {
         if (t_mode == CEED_TRANSPOSE) {
@@ -448,7 +448,7 @@ static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, Ce
     CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
     void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v};
 
-    CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args));
+    CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, Q, num_t_col, 1, shared_mem, args));
   }
 
   // Must sync to ensure completeness

From 58e06b7258417d52453a30aa89a13f7908ace0c8 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Feb 2025 10:19:22 -0700
Subject: [PATCH 300/571] op - minor, make Apply call ApplyAdd on composite
 over subs

---
 interface/ceed-operator.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 515c4e3f52..445adbeeb6 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -2161,10 +2161,8 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques
           CeedCall(CeedVectorDestroy(&vec));
         }
       }
-      // Apply
-      for (CeedInt i = 0; i < num_suboperators; i++) {
-        CeedCall(CeedOperatorApplyAdd(sub_operators[i], in, out, request));
-      }
+      // ApplyAdd
+      CeedCall(CeedOperatorApplyAdd(op, in, out, request));
     }
   } else {
     // Standard Operator

From c99afcd86b1f347c46664d00888893a7ea393539 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Feb 2025 10:38:13 -0700
Subject: [PATCH 301/571] gpu - gen ApplyAdd functions

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 43 +++++++++++++++++++++-
 backends/cuda-gen/ceed-cuda-gen.c          |  1 +
 backends/hip-gen/ceed-hip-gen-operator.c   | 43 +++++++++++++++++++++-
 backends/hip-gen/ceed-hip-gen.c            |  1 +
 4 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index c80f08a3d2..464418e233 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -293,17 +293,58 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool              is_run_good[CEED_COMPOSITE_MAX] = {false};
+  CeedInt           num_suboperators;
+  const CeedScalar *input_arr  = NULL;
+  CeedScalar       *output_arr = NULL;
+  CeedOperator     *sub_operators;
+
+  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    CeedInt num_elem = 0;
+
+    CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
+    if (num_elem > 0) {
+      CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], NULL, input_arr, output_arr, &is_run_good[i], request));
+    }
+  }
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+
+  // Fallback on unsuccessful run
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    if (!is_run_good[i]) {
+      CeedOperator op_fallback;
+
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+      CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
+      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
+  bool                   is_composite;
   Ceed                   ceed;
   CeedOperator_Cuda_gen *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Cuda_gen));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
   CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index 213a769052..404b3b5a89 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -39,6 +39,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "CompositeOperatorCreate", CeedOperatorCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 2041c872df..46f58ebb18 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -240,17 +240,58 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool              is_run_good[CEED_COMPOSITE_MAX] = {false};
+  CeedInt           num_suboperators;
+  const CeedScalar *input_arr  = NULL;
+  CeedScalar       *output_arr = NULL;
+  CeedOperator     *sub_operators;
+
+  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    CeedInt num_elem = 0;
+
+    CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
+    if (num_elem > 0) {
+      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], NULL, input_arr, output_arr, &is_run_good[i], request));
+    }
+  }
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+
+  // Fallback on unsuccessful run
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    if (!is_run_good[i]) {
+      CeedOperator op_fallback;
+
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+      CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
+      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
 int CeedOperatorCreate_Hip_gen(CeedOperator op) {
+  bool                  is_composite;
   Ceed                  ceed;
   CeedOperator_Hip_gen *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Hip_gen));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
   CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index b7a8c76d0a..4ba43f8918 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -39,6 +39,7 @@ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "CompositeOperatorCreate", CeedOperatorCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
   return CEED_ERROR_SUCCESS;

From 087855af316bb603747bedb7a7e8830a22c7e257 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Feb 2025 10:55:25 -0700
Subject: [PATCH 302/571] gpu - gen put suboperators on separate streams

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 12 ++++++++++--
 backends/hip-gen/ceed-hip-gen-operator.c   | 14 +++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 464418e233..8ec58978da 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -298,8 +298,10 @@ static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector in
   CeedInt           num_suboperators;
   const CeedScalar *input_arr  = NULL;
   CeedScalar       *output_arr = NULL;
+  Ceed              ceed;
   CeedOperator     *sub_operators;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
   CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
@@ -309,22 +311,28 @@ static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector in
 
     CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
     if (num_elem > 0) {
-      CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], NULL, input_arr, output_arr, &is_run_good[i], request));
+      cudaStream_t stream = NULL;
+
+      CeedCallCuda(ceed, cudaStreamCreate(&stream));
+      CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request));
+      CeedCallCuda(ceed, cudaStreamDestroy(stream));
     }
   }
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+  CeedCallCuda(ceed, cudaDeviceSynchronize());
 
   // Fallback on unsuccessful run
   for (CeedInt i = 0; i < num_suboperators; i++) {
     if (!is_run_good[i]) {
       CeedOperator op_fallback;
 
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
       CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 46f58ebb18..2db86e3e14 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -225,7 +225,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   // Try to run kernel
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
-  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
+  CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
 
@@ -245,8 +245,10 @@ static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector inp
   CeedInt           num_suboperators;
   const CeedScalar *input_arr  = NULL;
   CeedScalar       *output_arr = NULL;
+  Ceed              ceed;
   CeedOperator     *sub_operators;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
   CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
@@ -256,22 +258,28 @@ static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector inp
 
     CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
     if (num_elem > 0) {
-      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], NULL, input_arr, output_arr, &is_run_good[i], request));
+      hipStream_t stream = NULL;
+
+      CeedCallHip(ceed, hipStreamCreate(&stream));
+      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request));
+      CeedCallHip(ceed, hipStreamDestroy(stream));
     }
   }
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+  CeedCallHip(ceed, hipDeviceSynchronize());
 
   // Fallback on unsuccessful run
   for (CeedInt i = 0; i < num_suboperators; i++) {
     if (!is_run_good[i]) {
       CeedOperator op_fallback;
 
-      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
       CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 

From 0c8fbeed9f3d7a529e0100c3518095eb7a58974d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 26 Feb 2025 09:26:32 -0700
Subject: [PATCH 303/571] gpu - gen should use GetArray over GetArrayWrite

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 2 +-
 backends/hip-gen/ceed-hip-gen-operator.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 8ec58978da..05e9498d02 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -155,7 +155,7 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
       if (is_active) data->fields.outputs[i] = output_arr;
-      else CeedCallBackend(CeedVectorGetArrayWrite(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
+      else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
       CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 2db86e3e14..0ead03b8f5 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -89,7 +89,7 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
       if (is_active) data->fields.outputs[i] = output_arr;
-      else CeedCallBackend(CeedVectorGetArrayWrite(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
+      else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
       CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }

From 2fc995f6102555f4a428ce850576f23ccfe8b680 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 26 Feb 2025 09:57:01 -0700
Subject: [PATCH 304/571] ex - add Poisson to python tutorial 5

---
 examples/python/tutorial-5-operator.ipynb | 96 ++++++++++++++++++++++-
 1 file changed, 94 insertions(+), 2 deletions(-)

diff --git a/examples/python/tutorial-5-operator.ipynb b/examples/python/tutorial-5-operator.ipynb
index 123a59836c..c99030075e 100644
--- a/examples/python/tutorial-5-operator.ipynb
+++ b/examples/python/tutorial-5-operator.ipynb
@@ -125,11 +125,103 @@
     "with v.array_read() as v_array:\n",
     "  print('The length of the domain is l = %4.2f'%np.sum(v_array))"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* In the next example, we create and apply a CeedOperator for the Poisson operator in 1D. By applying this operator to a vector with a linear function, we compute the 'surface area' of this 1D domain, similar to Ex2-Surface in the [tutorial-6-shell tutorial](./tutorial-6-shell.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import libceed\n",
+    "import numpy as np\n",
+    "\n",
+    "ceed = libceed.Ceed()\n",
+    "\n",
+    "nelem = 15\n",
+    "p = 5\n",
+    "q = 8\n",
+    "nx = nelem + 1\n",
+    "nu = nelem*(p-1) + 1\n",
+    "\n",
+    "# Vectors\n",
+    "x = ceed.Vector(nx)\n",
+    "x_array = np.zeros(nx)\n",
+    "for i in range(nx):\n",
+    "  x_array[i] = i / (nx - 1.0)\n",
+    "x.set_array(x_array, cmode=libceed.USE_POINTER)\n",
+    "\n",
+    "qdata = ceed.Vector(nelem*q)\n",
+    "u = ceed.Vector(nu)\n",
+    "v = ceed.Vector(nu)\n",
+    "\n",
+    "# Restrictions\n",
+    "indx = np.zeros(nx*2, dtype=\"int32\")\n",
+    "for i in range(nx):\n",
+    "  indx[2*i+0] = i\n",
+    "  indx[2*i+1] = i+1\n",
+    "rx = ceed.ElemRestriction(nelem, 2, 1, 1, nx, indx, cmode=libceed.USE_POINTER)\n",
+    "\n",
+    "indu = np.zeros(nelem*p, dtype=\"int32\")\n",
+    "for i in range(nelem):\n",
+    "  for j in range(p):\n",
+    "    indu[p*i+j] = i*(p-1) + j\n",
+    "ru = ceed.ElemRestriction(nelem, p, 1, 1, nu, indu, cmode=libceed.USE_POINTER)\n",
+    "strides = np.array([1, q, q], dtype=\"int32\")\n",
+    "rui = ceed.StridedElemRestriction(nelem, q, 1, q*nelem, strides)\n",
+    "\n",
+    "# Bases\n",
+    "bx = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
+    "bu = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
+    "\n",
+    "# QFunctions\n",
+    "qf_setup = ceed.QFunctionByName(\"Poisson1DBuild\")\n",
+    "qf_mass = ceed.QFunctionByName(\"Poisson1DApply\")\n",
+    "\n",
+    "# Setup operator\n",
+    "op_setup = ceed.Operator(qf_setup)\n",
+    "op_setup.set_field(\"dx\", rx, bx, libceed.VECTOR_ACTIVE)\n",
+    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, bx,\n",
+    "                   libceed.VECTOR_NONE)\n",
+    "op_setup.set_field(\"qdata\", rui, libceed.BASIS_NONE,\n",
+    "                   libceed.VECTOR_ACTIVE)\n",
+    "op_setup.check()\n",
+    "print('Setup operator: ', op_setup)\n",
+    "\n",
+    "# Poisson operator\n",
+    "op_poisson = ceed.Operator(qf_mass)\n",
+    "op_poisson.set_field(\"du\", ru, bu, libceed.VECTOR_ACTIVE)\n",
+    "op_poisson.set_field(\"qdata\", rui, libceed.BASIS_NONE, qdata)\n",
+    "op_poisson.set_field(\"dv\", ru, bu, libceed.VECTOR_ACTIVE)\n",
+    "op_poisson.check()\n",
+    "print('Poisson operator: ', op_poisson)\n",
+    "\n",
+    "# Setup\n",
+    "op_setup.apply(x, qdata)\n",
+    "\n",
+    "# Apply Poisson operator\n",
+    "with u.array_write() as u_array:\n",
+    "  [points, _] = ceed.lobatto_quadrature(p)\n",
+    "  for elem in range(nelem):\n",
+    "      for point in range(p):\n",
+    "          u_array[elem * (p - 1) + point] = (1.0 + 2.0 * elem + points[point])/(2.0 * nelem)\n",
+    "op_poisson.apply(u, v)\n",
+    "\n",
+    "# Check\n",
+    "with v.array_read() as v_array:\n",
+    "  print('The surface area of the domain is dl = %4.2f'%np.sum(abs(v_array)))"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -143,7 +235,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,

From 235e439982b8aa90553a430cee711df838bd85a4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 26 Feb 2025 10:28:40 -0700
Subject: [PATCH 305/571] ex - use words for variable names in Python tutorials

---
 .../python/tutorial-2-elemrestriction.ipynb   |  92 +++++-----
 examples/python/tutorial-3-basis.ipynb        | 170 +++++++++---------
 examples/python/tutorial-4-qfunction.ipynb    |   4 +-
 examples/python/tutorial-5-operator.ipynb     | 124 ++++++-------
 4 files changed, 194 insertions(+), 196 deletions(-)

diff --git a/examples/python/tutorial-2-elemrestriction.ipynb b/examples/python/tutorial-2-elemrestriction.ipynb
index c9a9483288..6c8f8593a7 100644
--- a/examples/python/tutorial-2-elemrestriction.ipynb
+++ b/examples/python/tutorial-2-elemrestriction.ipynb
@@ -61,20 +61,20 @@
     "\n",
     "ceed = libceed.Ceed()\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
-    "x = ceed.Vector(ne+1)\n",
-    "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n",
+    "x = ceed.Vector(num_elem+1)\n",
+    "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "ind = np.zeros(2*ne, dtype=\"int32\")\n",
-    "for i in range(ne):\n",
-    "  ind[2*i+0] = i\n",
-    "  ind[2*i+1] = i+1\n",
+    "indices = np.zeros(2*num_elem, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
+    "  indices[2*i+0] = i\n",
+    "  indices[2*i+1] = i+1\n",
     "    \n",
-    "r = ceed.ElemRestriction(ne, 2, 1, 1, ne+1, ind, cmode=libceed.USE_POINTER)\n",
+    "r = ceed.ElemRestriction(num_elem, 2, 1, 1, num_elem+1, indices, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "y = ceed.Vector(2*ne)\n",
+    "y = ceed.Vector(2*num_elem)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply(x, y)\n",
@@ -100,17 +100,17 @@
     "# \n",
     "#  x -- o -- o -- x -- o -- o -- x -- o -- o -- x\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
-    "ind = np.zeros(4*ne, dtype=\"int32\")\n",
+    "indices = np.zeros(4*num_elem, dtype=\"int32\")\n",
     "\n",
-    "for i in range(ne):\n",
-    "  ind[4*i+0] = i*3+0\n",
-    "  ind[4*i+1] = i*3+1\n",
-    "  ind[4*i+2] = i*3+2\n",
-    "  ind[4*i+3] = i*3+3\n",
+    "for i in range(num_elem):\n",
+    "  indices[4*i+0] = i*3+0\n",
+    "  indices[4*i+1] = i*3+1\n",
+    "  indices[4*i+2] = i*3+2\n",
+    "  indices[4*i+3] = i*3+3\n",
     "\n",
-    "r = ceed.ElemRestriction(ne, 4, 1, 1, 3*ne+1, ind, cmode=libceed.USE_POINTER)\n",
+    "r = ceed.ElemRestriction(num_elem, 4, 1, 1, 3*num_elem+1, indices, cmode=libceed.USE_POINTER)\n",
     "\n",
     "mult = r.get_multiplicity()\n",
     "\n",
@@ -141,17 +141,17 @@
     "#  x --  x |  x --  x |  x --  x\n",
     "# 10 -- 11 | 12 -- 13 | 14 -- 15\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
-    "x = ceed.Vector(2*ne)\n",
-    "a = np.arange(10, 10 + 2*ne, dtype=\"float64\")\n",
+    "x = ceed.Vector(2*num_elem)\n",
+    "a = np.arange(10, 10 + 2*num_elem, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
     "strides = np.array([1, 2, 2], dtype=\"int32\")\n",
     "\n",
-    "r = ceed.StridedElemRestriction(ne, 2, 1, 2*ne, strides)\n",
+    "r = ceed.StridedElemRestriction(num_elem, 2, 1, 2*num_elem, strides)\n",
     "\n",
-    "y = ceed.Vector(2*ne)\n",
+    "y = ceed.Vector(2*num_elem)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply(x, y)\n",
@@ -177,11 +177,11 @@
     "# \n",
     "#  x -- x -- x -- x\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
     "strides = np.array([1, 2, 2], dtype=\"int32\")\n",
     "\n",
-    "r = ceed.BlockedStridedElemRestriction(ne, 2, 2, 1, ne+1, strides)\n",
+    "r = ceed.BlockedStridedElemRestriction(num_elem, 2, 2, 1, 2*(num_elem+1), strides)\n",
     "\n",
     "print(r)"
    ]
@@ -233,22 +233,22 @@
     "# | 10-11-12-13-14        11-12-13-14-15 | 15-16-17-17-17        16-17-18-18-18 |\n",
     "# | e0 e1 e2 e3 e4        e0 e1 e2 e3 e4 | e0 e1 e2 e3 e4        e0 e1 e2 e3 e4 |\n",
     "\n",
-    "ne = 8\n",
-    "blksize = 5\n",
+    "num_elem = 8\n",
+    "block_size = 5\n",
     "\n",
-    "x = ceed.Vector(ne+1)\n",
-    "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n",
+    "x = ceed.Vector(num_elem+1)\n",
+    "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "ind = np.zeros(2*ne, dtype=\"int32\")\n",
-    "for i in range(ne):\n",
-    "  ind[2*i+0] = i\n",
-    "  ind[2*i+1] = i+1\n",
+    "indices = np.zeros(2*num_elem, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
+    "  indices[2*i+0] = i\n",
+    "  indices[2*i+1] = i+1\n",
     "\n",
-    "r = ceed.BlockedElemRestriction(ne, 2, blksize, 1, 1, ne+1, ind,\n",
+    "r = ceed.BlockedElemRestriction(num_elem, 2, block_size, 1, 1, num_elem+1, indices,\n",
     "                                cmode=libceed.USE_POINTER)\n",
     "\n",
-    "y = ceed.Vector(2*blksize*2)\n",
+    "y = ceed.Vector(2*block_size*2)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply(x, y)\n",
@@ -303,22 +303,22 @@
     "# | 15-16-17-17-17        16-17-18-18-18 |\n",
     "# | e0 e1 e2 e3 e4        e0 e1 e2 e3 e4 |\n",
     "\n",
-    "ne = 8\n",
-    "blksize = 5\n",
+    "num_elem = 8\n",
+    "block_size = 5\n",
     "\n",
-    "x = ceed.Vector(ne+1)\n",
-    "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n",
+    "x = ceed.Vector(num_elem+1)\n",
+    "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "ind = np.zeros(2*ne, dtype=\"int32\")\n",
-    "for i in range(ne):\n",
-    "  ind[2*i+0] = i\n",
-    "  ind[2*i+1] = i+1\n",
+    "indices = np.zeros(2*num_elem, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
+    "  indices[2*i+0] = i\n",
+    "  indices[2*i+1] = i+1\n",
     "\n",
-    "r = ceed.BlockedElemRestriction(ne, 2, blksize, 1, 1, ne+1, ind,\n",
+    "r = ceed.BlockedElemRestriction(num_elem, 2, block_size, 1, 1, num_elem+1, indices,\n",
     "                                cmode=libceed.USE_POINTER)\n",
     "\n",
-    "y = ceed.Vector(blksize*2)\n",
+    "y = ceed.Vector(block_size*2)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply_block(1, x, y)\n",
@@ -343,7 +343,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -357,7 +357,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,
diff --git a/examples/python/tutorial-3-basis.ipynb b/examples/python/tutorial-3-basis.ipynb
index a2141e4e9d..ef18be2789 100644
--- a/examples/python/tutorial-3-basis.ipynb
+++ b/examples/python/tutorial-3-basis.ipynb
@@ -63,11 +63,11 @@
     "    center += 0.1\n",
     "  return result\n",
     "\n",
-    "def feval(x1, x2):\n",
-    "  return x1*x1 + x2*x2 + x1*x2 + 1\n",
+    "def feval(x_1, x_2):\n",
+    "  return x_1*x_1 + x_2*x_2 + x_1*x_2 + 1\n",
     "\n",
-    "def dfeval(x1, x2):\n",
-    "  return 2*x1 + x2"
+    "def dfeval(x_1, x_2):\n",
+    "  return 2*x_1 + x_2"
    ]
   },
   {
@@ -112,24 +112,24 @@
    "outputs": [],
    "source": [
     "P = b.get_num_nodes()\n",
-    "nviz = 50\n",
-    "bviz = ceed.BasisTensorH1Lagrange(1, 1, P, nviz, libceed.GAUSS_LOBATTO)\n",
+    "Q_viz = 50\n",
+    "basis_viz = ceed.BasisTensorH1Lagrange(1, 1, P, Q_viz, libceed.GAUSS_LOBATTO)\n",
     "\n",
     "# Construct P \"elements\" with one node activated\n",
     "I = ceed.Vector(P * P)\n",
-    "with I.array(P, P) as x:\n",
+    "with I.array_write(P, P) as x:\n",
     "    x[...] = np.eye(P)\n",
     "\n",
-    "Bvander = ceed.Vector(P * nviz)\n",
-    "bviz.apply(4, libceed.EVAL_INTERP, I, Bvander)\n",
+    "basis_fns = ceed.Vector(P * Q_viz)\n",
+    "basis_viz.apply(4, libceed.EVAL_INTERP, I, basis_fns)\n",
     "\n",
-    "qviz, _weight = ceed.lobatto_quadrature(nviz)\n",
-    "with Bvander.array_read(nviz, P) as B:\n",
-    "    plt.plot(qviz, B)\n",
+    "qpts_viz, _ = ceed.lobatto_quadrature(Q_viz)\n",
+    "with basis_fns.array_read(Q_viz, P) as B_array:\n",
+    "    plt.plot(qpts_viz, B_array)\n",
     "\n",
     "# Mark tho Lobatto nodes\n",
-    "qb, _weight = ceed.lobatto_quadrature(P)\n",
-    "plt.plot(qb, 0*qb, 'ok');"
+    "nodes, _ = ceed.lobatto_quadrature(P)\n",
+    "plt.plot(nodes, 0*nodes, 'ok');"
    ]
   },
   {
@@ -148,11 +148,11 @@
     "b = ceed.BasisTensorH1Lagrange(1, 1, 4, 4, libceed.GAUSS)\n",
     "print(b)\n",
     "\n",
-    "with Bvander.array_read(nviz, P) as B:\n",
-    "    plt.plot(qviz, B)\n",
+    "with basis_fns.array_read(Q_viz, P) as B_array:\n",
+    "    plt.plot(qpts_viz, B_array)\n",
     "# Mark tho Gauss quadrature points\n",
-    "qb, _weight = ceed.gauss_quadrature(P)\n",
-    "plt.plot(qb, 0*qb, 'ok');"
+    "qpts, _ = ceed.gauss_quadrature(P)\n",
+    "plt.plot(qpts, 0*qpts, 'ok');"
    ]
   },
   {
@@ -193,54 +193,52 @@
    "source": [
     "for dim in range(1, 4):\n",
     "  Q = 4\n",
-    "  Qdim = Q**dim\n",
-    "  Xdim = 2**dim\n",
-    "  x = np.empty(Xdim*dim, dtype=\"float64\")\n",
-    "  uq = np.empty(Qdim, dtype=\"float64\")\n",
+    "  Q_dim = Q**dim\n",
+    "  X_dim = 2**dim\n",
+    "  x = np.empty(X_dim*dim, dtype=\"float64\")\n",
+    "  u_array = np.empty(Q_dim, dtype=\"float64\")\n",
     "\n",
     "  for d in range(dim):\n",
-    "    for i in range(Xdim):\n",
-    "      x[d*Xdim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
+    "    for i in range(X_dim):\n",
+    "      x[d*X_dim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
     "\n",
-    "  X = ceed.Vector(Xdim*dim)\n",
+    "  X = ceed.Vector(X_dim*dim)\n",
     "  X.set_array(x, cmode=libceed.USE_POINTER)\n",
-    "  Xq = ceed.Vector(Qdim*dim)\n",
-    "  Xq.set_value(0)\n",
-    "  U = ceed.Vector(Qdim)\n",
+    "  X_q = ceed.Vector(Q_dim*dim)\n",
+    "  X_q.set_value(0)\n",
+    "  U = ceed.Vector(Q_dim)\n",
     "  U.set_value(0)\n",
-    "  Uq = ceed.Vector(Qdim)\n",
+    "  U_q = ceed.Vector(Q_dim)\n",
     "\n",
-    "  bxl = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS_LOBATTO)\n",
-    "  bul = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS_LOBATTO)\n",
+    "  basis_x_lobatto = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS_LOBATTO)\n",
+    "  basis_u_lobatto = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS_LOBATTO)\n",
     "\n",
-    "  bxl.apply(1, libceed.EVAL_INTERP, X, Xq)\n",
+    "  basis_x_lobatto.apply(1, libceed.EVAL_INTERP, X, X_q)\n",
     "\n",
-    "  with Xq.array_read() as xq:\n",
-    "    for i in range(Qdim):\n",
-    "      xx = np.empty(dim, dtype=\"float64\")\n",
+    "  with X_q.array_read() as x_array:\n",
+    "    for i in range(Q_dim):\n",
+    "      x = np.empty(dim, dtype=\"float64\")\n",
     "      for d in range(dim):\n",
-    "        xx[d] = xq[d*Qdim + i]\n",
-    "      uq[i] = eval(dim, xx)\n",
+    "        x[d] = x_array[d*Q_dim + i]\n",
+    "      u_array[i] = eval(dim, x)\n",
     "\n",
-    "  Uq.set_array(uq, cmode=libceed.USE_POINTER)\n",
+    "  U_q.set_array(u_array, cmode=libceed.USE_POINTER)\n",
     "\n",
     "  # This operation is the identity because the quadrature is collocated\n",
-    "  bul.T.apply(1, libceed.EVAL_INTERP, Uq, U)\n",
+    "  basis_u_lobatto.T.apply(1, libceed.EVAL_INTERP, U_q, U)\n",
     "\n",
-    "  bxg = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS)\n",
-    "  bug = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS)\n",
+    "  basis_x_gauss = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS)\n",
+    "  basis_u_gauss = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS)\n",
     "\n",
-    "  bxg.apply(1, libceed.EVAL_INTERP, X, Xq)\n",
-    "  bug.apply(1, libceed.EVAL_INTERP, U, Uq)\n",
+    "  basis_x_gauss.apply(1, libceed.EVAL_INTERP, X, X_q)\n",
+    "  basis_u_gauss.apply(1, libceed.EVAL_INTERP, U, U_q)\n",
     "\n",
-    "  with Xq.array_read() as xq, Uq.array_read() as u:\n",
-    "    #print('xq =', xq)\n",
-    "    #print('u =', u)\n",
+    "  with X_q.array_read() as x_array, U_q.array_read() as u_array:\n",
     "    if dim == 2:\n",
     "        # Default ordering is contiguous in x direction, but\n",
     "        # pyplot expects meshgrid convention, which is transposed.\n",
-    "        x, y = xq.reshape(2, Q, Q).transpose(0, 2, 1)\n",
-    "        plt.scatter(x, y, c=np.array(u).reshape(Q, Q))\n",
+    "        x, y = x_array.reshape(2, Q, Q).transpose(0, 2, 1)\n",
+    "        plt.scatter(x, y, c=np.array(u_array).reshape(Q, Q))\n",
     "        plt.xlim(-1, 1)\n",
     "        plt.ylim(-1, 1)\n",
     "        plt.colorbar(label='u')"
@@ -261,62 +259,62 @@
    "source": [
     "for dim in range (1, 4):\n",
     "  P, Q = 8, 10\n",
-    "  Pdim = P**dim\n",
-    "  Qdim = Q**dim\n",
-    "  Xdim = 2**dim\n",
-    "  sum1 = sum2 = 0\n",
-    "  x = np.empty(Xdim*dim, dtype=\"float64\")\n",
-    "  u = np.empty(Pdim, dtype=\"float64\")\n",
+    "  P_dim = P**dim\n",
+    "  Q_dim = Q**dim\n",
+    "  X_dim = 2**dim\n",
+    "  sum_1 = sum_2 = 0\n",
+    "  x_array = np.empty(X_dim*dim, dtype=\"float64\")\n",
+    "  u_array = np.empty(P_dim, dtype=\"float64\")\n",
     "\n",
     "  for d in range(dim):\n",
-    "    for i in range(Xdim):\n",
-    "      x[d*Xdim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
-    "\n",
-    "  X = ceed.Vector(Xdim*dim)\n",
-    "  X.set_array(x, cmode=libceed.USE_POINTER)\n",
-    "  Xq = ceed.Vector(Pdim*dim)\n",
-    "  Xq.set_value(0)\n",
-    "  U = ceed.Vector(Pdim)\n",
-    "  Uq = ceed.Vector(Qdim*dim)\n",
-    "  Uq.set_value(0)\n",
-    "  Ones = ceed.Vector(Qdim*dim)\n",
+    "    for i in range(X_dim):\n",
+    "      x_array[d*X_dim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
+    "\n",
+    "  X = ceed.Vector(X_dim*dim)\n",
+    "  X.set_array(x_array, cmode=libceed.USE_POINTER)\n",
+    "  X_q = ceed.Vector(P_dim*dim)\n",
+    "  X_q.set_value(0)\n",
+    "  U = ceed.Vector(P_dim)\n",
+    "  U_q = ceed.Vector(Q_dim*dim)\n",
+    "  U_q.set_value(0)\n",
+    "  Ones = ceed.Vector(Q_dim*dim)\n",
     "  Ones.set_value(1)\n",
-    "  Gtposeones = ceed.Vector(Pdim)\n",
-    "  Gtposeones.set_value(0)\n",
+    "  G_transpose_ones = ceed.Vector(P_dim)\n",
+    "  G_transpose_ones.set_value(0)\n",
     "\n",
     "  # Get function values at quadrature points\n",
-    "  bxl = ceed.BasisTensorH1Lagrange(dim, dim, 2, P, libceed.GAUSS_LOBATTO)\n",
-    "  bxl.apply(1, libceed.EVAL_INTERP, X, Xq)\n",
+    "  basis_x_lobatto = ceed.BasisTensorH1Lagrange(dim, dim, 2, P, libceed.GAUSS_LOBATTO)\n",
+    "  basis_x_lobatto.apply(1, libceed.EVAL_INTERP, X, X_q)\n",
     "\n",
-    "  with Xq.array_read() as xq:\n",
-    "    for i in range(Pdim):\n",
-    "      xx = np.empty(dim, dtype=\"float64\")\n",
+    "  with X_q.array_read() as x_array:\n",
+    "    for i in range(P_dim):\n",
+    "      x = np.empty(dim, dtype=\"float64\")\n",
     "      for d in range(dim):\n",
-    "        xx[d] = xq[d*Pdim + i]\n",
-    "      u[i] = eval(dim, xx)\n",
+    "        x[d] = x_array[d*P_dim + i]\n",
+    "      u_array[i] = eval(dim, x)\n",
     "\n",
-    "  U.set_array(u, cmode=libceed.USE_POINTER)\n",
+    "  U.set_array(u_array, cmode=libceed.USE_POINTER)\n",
     "\n",
     "  # Calculate G u at quadrature points, G' * 1 at dofs\n",
-    "  bug = ceed.BasisTensorH1Lagrange(dim, 1, P, Q, libceed.GAUSS)\n",
-    "  bug.apply(1, libceed.EVAL_GRAD, U, Uq)\n",
-    "  bug.T.apply(1, libceed.EVAL_GRAD, Ones, Gtposeones)\n",
+    "  basis_u_gauss = ceed.BasisTensorH1Lagrange(dim, 1, P, Q, libceed.GAUSS)\n",
+    "  basis_u_gauss.apply(1, libceed.EVAL_GRAD, U, U_q)\n",
+    "  basis_u_gauss.T.apply(1, libceed.EVAL_GRAD, Ones, G_transpose_ones)\n",
     "\n",
     "  # Check if 1' * G * u = u' * (G' * 1)\n",
-    "  with Gtposeones.array_read() as gtposeones, Uq.array_read() as uq:\n",
-    "    for i in range(Pdim):\n",
-    "      sum1 += gtposeones[i]*u[i]\n",
-    "    for i in range(dim*Qdim):\n",
-    "      sum2 += uq[i]\n",
+    "  with G_transpose_ones.array_read() as g_array, U_q.array_read() as uq_array:\n",
+    "    for i in range(P_dim):\n",
+    "      sum_1 += g_array[i]*u_array[i]\n",
+    "    for i in range(dim*Q_dim):\n",
+    "      sum_2 += uq_array[i]\n",
     "\n",
     "  # Check that (1' * G * u - u' * (G' * 1)) is numerically zero\n",
-    "  print('1T * G * u - uT * (GT * 1) =', np.abs(sum1 - sum2))"
+    "  print('1T * G * u - uT * (GT * 1) =', np.abs(sum_1 - sum_2))"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -330,7 +328,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,
diff --git a/examples/python/tutorial-4-qfunction.ipynb b/examples/python/tutorial-4-qfunction.ipynb
index d6495e241e..9aba23d2fc 100644
--- a/examples/python/tutorial-4-qfunction.ipynb
+++ b/examples/python/tutorial-4-qfunction.ipynb
@@ -189,7 +189,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -203,7 +203,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,
diff --git a/examples/python/tutorial-5-operator.ipynb b/examples/python/tutorial-5-operator.ipynb
index c99030075e..bb756a42a7 100644
--- a/examples/python/tutorial-5-operator.ipynb
+++ b/examples/python/tutorial-5-operator.ipynb
@@ -56,41 +56,41 @@
     "\n",
     "ceed = libceed.Ceed()\n",
     "\n",
-    "nelem = 15\n",
+    "num_elem = 15\n",
     "p = 5\n",
     "q = 8\n",
-    "nx = nelem + 1\n",
-    "nu = nelem*(p-1) + 1\n",
+    "num_x = num_elem + 1\n",
+    "num_u = num_elem*(p-1) + 1\n",
     "\n",
     "# Vectors\n",
-    "x = ceed.Vector(nx)\n",
-    "x_array = np.zeros(nx)\n",
-    "for i in range(nx):\n",
-    "  x_array[i] = i / (nx - 1.0)\n",
+    "x = ceed.Vector(num_x)\n",
+    "x_array = np.zeros(num_x)\n",
+    "for i in range(num_x):\n",
+    "  x_array[i] = i / (num_x - 1.0)\n",
     "x.set_array(x_array, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "qdata = ceed.Vector(nelem*q)\n",
-    "u = ceed.Vector(nu)\n",
-    "v = ceed.Vector(nu)\n",
+    "q_data = ceed.Vector(num_elem*q)\n",
+    "u = ceed.Vector(num_u)\n",
+    "v = ceed.Vector(num_u)\n",
     "\n",
     "# Restrictions\n",
-    "indx = np.zeros(nx*2, dtype=\"int32\")\n",
-    "for i in range(nx):\n",
-    "  indx[2*i+0] = i\n",
-    "  indx[2*i+1] = i+1\n",
-    "rx = ceed.ElemRestriction(nelem, 2, 1, 1, nx, indx, cmode=libceed.USE_POINTER)\n",
-    "\n",
-    "indu = np.zeros(nelem*p, dtype=\"int32\")\n",
-    "for i in range(nelem):\n",
+    "indices_x = np.zeros(num_x*2, dtype=\"int32\")\n",
+    "for i in range(num_x):\n",
+    "  indices_x[2*i+0] = i\n",
+    "  indices_x[2*i+1] = i+1\n",
+    "restriction_x = ceed.ElemRestriction(num_elem, 2, 1, 1, num_x, indices_x, cmode=libceed.USE_POINTER)\n",
+    "\n",
+    "indices_u = np.zeros(num_elem*p, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
     "  for j in range(p):\n",
-    "    indu[p*i+j] = i*(p-1) + j\n",
-    "ru = ceed.ElemRestriction(nelem, p, 1, 1, nu, indu, cmode=libceed.USE_POINTER)\n",
+    "    indices_u[p*i+j] = i*(p-1) + j\n",
+    "restriction_u = ceed.ElemRestriction(num_elem, p, 1, 1, num_u, indices_u, cmode=libceed.USE_POINTER)\n",
     "strides = np.array([1, q, q], dtype=\"int32\")\n",
-    "rui = ceed.StridedElemRestriction(nelem, q, 1, q*nelem, strides)\n",
+    "restriction_q_data = ceed.StridedElemRestriction(num_elem, q, 1, q*num_elem, strides)\n",
     "\n",
     "# Bases\n",
-    "bx = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
-    "bu = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
+    "basis_x = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
+    "basis_u = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
     "\n",
     "# QFunctions\n",
     "qf_setup = ceed.QFunctionByName(\"Mass1DBuild\")\n",
@@ -98,24 +98,24 @@
     "\n",
     "# Setup operator\n",
     "op_setup = ceed.Operator(qf_setup)\n",
-    "op_setup.set_field(\"dx\", rx, bx, libceed.VECTOR_ACTIVE)\n",
-    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, bx,\n",
+    "op_setup.set_field(\"dx\", restriction_x, basis_x, libceed.VECTOR_ACTIVE)\n",
+    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, basis_x,\n",
     "                   libceed.VECTOR_NONE)\n",
-    "op_setup.set_field(\"qdata\", rui, libceed.BASIS_NONE,\n",
+    "op_setup.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE,\n",
     "                   libceed.VECTOR_ACTIVE)\n",
     "op_setup.check()\n",
     "print('Setup operator: ', op_setup)\n",
     "\n",
     "# Mass operator\n",
     "op_mass = ceed.Operator(qf_mass)\n",
-    "op_mass.set_field(\"u\", ru, bu, libceed.VECTOR_ACTIVE)\n",
-    "op_mass.set_field(\"qdata\", rui, libceed.BASIS_NONE, qdata)\n",
-    "op_mass.set_field(\"v\", ru, bu, libceed.VECTOR_ACTIVE)\n",
+    "op_mass.set_field(\"u\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
+    "op_mass.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE, q_data)\n",
+    "op_mass.set_field(\"v\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
     "op_mass.check()\n",
     "print('Mass operator: ', op_mass)\n",
     "\n",
     "# Setup\n",
-    "op_setup.apply(x, qdata)\n",
+    "op_setup.apply(x, q_data)\n",
     "\n",
     "# Apply mass matrix\n",
     "u.set_value(1)\n",
@@ -144,41 +144,41 @@
     "\n",
     "ceed = libceed.Ceed()\n",
     "\n",
-    "nelem = 15\n",
+    "num_elem = 15\n",
     "p = 5\n",
     "q = 8\n",
-    "nx = nelem + 1\n",
-    "nu = nelem*(p-1) + 1\n",
+    "num_x = num_elem + 1\n",
+    "num_u = num_elem*(p-1) + 1\n",
     "\n",
     "# Vectors\n",
-    "x = ceed.Vector(nx)\n",
-    "x_array = np.zeros(nx)\n",
-    "for i in range(nx):\n",
-    "  x_array[i] = i / (nx - 1.0)\n",
+    "x = ceed.Vector(num_x)\n",
+    "x_array = np.zeros(num_x)\n",
+    "for i in range(num_x):\n",
+    "  x_array[i] = i / (num_x - 1.0)\n",
     "x.set_array(x_array, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "qdata = ceed.Vector(nelem*q)\n",
-    "u = ceed.Vector(nu)\n",
-    "v = ceed.Vector(nu)\n",
+    "q_data = ceed.Vector(num_elem*q)\n",
+    "u = ceed.Vector(num_u)\n",
+    "v = ceed.Vector(num_u)\n",
     "\n",
     "# Restrictions\n",
-    "indx = np.zeros(nx*2, dtype=\"int32\")\n",
-    "for i in range(nx):\n",
-    "  indx[2*i+0] = i\n",
-    "  indx[2*i+1] = i+1\n",
-    "rx = ceed.ElemRestriction(nelem, 2, 1, 1, nx, indx, cmode=libceed.USE_POINTER)\n",
-    "\n",
-    "indu = np.zeros(nelem*p, dtype=\"int32\")\n",
-    "for i in range(nelem):\n",
+    "indices_x = np.zeros(num_x*2, dtype=\"int32\")\n",
+    "for i in range(num_x):\n",
+    "  indices_x[2*i+0] = i\n",
+    "  indices_x[2*i+1] = i+1\n",
+    "restriction_x = ceed.ElemRestriction(num_elem, 2, 1, 1, num_x, indices_x, cmode=libceed.USE_POINTER)\n",
+    "\n",
+    "indices_u = np.zeros(num_elem*p, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
     "  for j in range(p):\n",
-    "    indu[p*i+j] = i*(p-1) + j\n",
-    "ru = ceed.ElemRestriction(nelem, p, 1, 1, nu, indu, cmode=libceed.USE_POINTER)\n",
+    "    indices_u[p*i+j] = i*(p-1) + j\n",
+    "restriction_u = ceed.ElemRestriction(num_elem, p, 1, 1, num_u, indices_u, cmode=libceed.USE_POINTER)\n",
     "strides = np.array([1, q, q], dtype=\"int32\")\n",
-    "rui = ceed.StridedElemRestriction(nelem, q, 1, q*nelem, strides)\n",
+    "restriction_q_data = ceed.StridedElemRestriction(num_elem, q, 1, q*num_elem, strides)\n",
     "\n",
     "# Bases\n",
-    "bx = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
-    "bu = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
+    "basis_x = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
+    "basis_u = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
     "\n",
     "# QFunctions\n",
     "qf_setup = ceed.QFunctionByName(\"Poisson1DBuild\")\n",
@@ -186,31 +186,31 @@
     "\n",
     "# Setup operator\n",
     "op_setup = ceed.Operator(qf_setup)\n",
-    "op_setup.set_field(\"dx\", rx, bx, libceed.VECTOR_ACTIVE)\n",
-    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, bx,\n",
+    "op_setup.set_field(\"dx\", restriction_x, basis_x, libceed.VECTOR_ACTIVE)\n",
+    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, basis_x,\n",
     "                   libceed.VECTOR_NONE)\n",
-    "op_setup.set_field(\"qdata\", rui, libceed.BASIS_NONE,\n",
+    "op_setup.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE,\n",
     "                   libceed.VECTOR_ACTIVE)\n",
     "op_setup.check()\n",
     "print('Setup operator: ', op_setup)\n",
     "\n",
     "# Poisson operator\n",
     "op_poisson = ceed.Operator(qf_mass)\n",
-    "op_poisson.set_field(\"du\", ru, bu, libceed.VECTOR_ACTIVE)\n",
-    "op_poisson.set_field(\"qdata\", rui, libceed.BASIS_NONE, qdata)\n",
-    "op_poisson.set_field(\"dv\", ru, bu, libceed.VECTOR_ACTIVE)\n",
+    "op_poisson.set_field(\"du\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
+    "op_poisson.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE, q_data)\n",
+    "op_poisson.set_field(\"dv\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
     "op_poisson.check()\n",
     "print('Poisson operator: ', op_poisson)\n",
     "\n",
     "# Setup\n",
-    "op_setup.apply(x, qdata)\n",
+    "op_setup.apply(x, q_data)\n",
     "\n",
     "# Apply Poisson operator\n",
     "with u.array_write() as u_array:\n",
     "  [points, _] = ceed.lobatto_quadrature(p)\n",
-    "  for elem in range(nelem):\n",
+    "  for elem in range(num_elem):\n",
     "      for point in range(p):\n",
-    "          u_array[elem * (p - 1) + point] = (1.0 + 2.0 * elem + points[point])/(2.0 * nelem)\n",
+    "          u_array[elem * (p - 1) + point] = (1.0 + 2.0 * elem + points[point])/(2.0 * num_elem)\n",
     "op_poisson.apply(u, v)\n",
     "\n",
     "# Check\n",

From a4065bfb851b046304cab6c8e82c8315ce7dda8f Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 28 Feb 2025 10:18:04 -0700
Subject: [PATCH 306/571] Add GetMinPointsInElement and
 GetMinMaxPointsInElement for elem restriction at points

---
 include/ceed/ceed.h              |  2 ++
 interface/ceed-elemrestriction.c | 61 +++++++++++++++++++++++++++-----
 2 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index b531bd8d28..872124c765 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -279,6 +279,8 @@ CEED_EXTERN int  CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, Cee
 CEED_EXTERN int  CeedElemRestrictionGetNumPoints(CeedElemRestriction rstr, CeedInt *num_points);
 CEED_EXTERN int  CeedElemRestrictionGetNumPointsInElement(CeedElemRestriction rstr, CeedInt elem, CeedInt *num_points);
 CEED_EXTERN int  CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points);
+CEED_EXTERN int  CeedElemRestrictionGetMinPointsInElement(CeedElemRestriction rstr, CeedInt *min_points);
+CEED_EXTERN int  CeedElemRestrictionGetMinMaxPointsInElement(CeedElemRestriction rstr, CeedInt *min_points, CeedInt *max_points);
 CEED_EXTERN int  CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, CeedSize *l_size);
 CEED_EXTERN int  CeedElemRestrictionGetEVectorSize(CeedElemRestriction rstr, CeedSize *e_size);
 CEED_EXTERN int  CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, CeedInt *num_comp);
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 50028e6934..1f705ecd2e 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -1548,34 +1548,77 @@ int CeedElemRestrictionGetNumPointsInElement(CeedElemRestriction rstr, CeedInt e
 }
 
 /**
-  @brief Get the maximum number of points in an element for a `CeedElemRestriction` at points
+  @brief Get the minimum and/or maximum number of points in an element for a `CeedElemRestriction` at points
 
   @param[in]  rstr       `CeedElemRestriction`
-  @param[out] max_points Variable to store size of elements
+  @param[out] min_points Variable to minimum number of points in an element, or `NULL`
+  @param[out] max_points Variable to maximum number of points in an element, or `NULL`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Advanced
 **/
-int CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points) {
-  CeedInt             num_elem;
+int CeedElemRestrictionGetMinMaxPointsInElement(CeedElemRestriction rstr, CeedInt *min_points, CeedInt *max_points) {
+  CeedInt             num_elem, num_points;
   CeedRestrictionType rstr_type;
 
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
-            "Cannot compute max points for a CeedElemRestriction that does not use points");
+            "Cannot compute min/max points for a CeedElemRestriction that does not use points");
 
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  *max_points = 0;
-  for (CeedInt e = 0; e < num_elem; e++) {
-    CeedInt num_points;
 
+  // Exit early if there are no elements
+  if (num_elem == 0) {
+    if (min_points) *min_points = 0;
+    if (max_points) *max_points = 0;
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Initialize to the number of points in the first element
+  CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, 0, &num_points));
+  if (min_points) *min_points = num_points;
+  if (max_points) *max_points = num_points;
+  for (CeedInt e = 1; e < num_elem; e++) {
     CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points));
-    *max_points = CeedIntMax(num_points, *max_points);
+    if (min_points) *min_points = CeedIntMin(num_points, *min_points);
+    if (max_points) *max_points = CeedIntMax(num_points, *max_points);
   }
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the maximum number of points in an element for a `CeedElemRestriction` at points
+
+  @param[in]  rstr       `CeedElemRestriction`
+  @param[out] max_points Variable to store maximum number of points in an element
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+
+  @see CeedElemRestrictionGetMinMaxPointsInElement()
+**/
+int CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points) {
+  return CeedElemRestrictionGetMinMaxPointsInElement(rstr, NULL, max_points);
+}
+
+/**
+  @brief Get the minimum number of points in an element for a `CeedElemRestriction` at points
+
+  @param[in]  rstr       `CeedElemRestriction`
+  @param[out] min_points Variable to store minimum number of points in an element
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+
+  @see CeedElemRestrictionGetMinMaxPointsInElement()
+**/
+int CeedElemRestrictionGetMinPointsInElement(CeedElemRestriction rstr, CeedInt *min_points) {
+  return CeedElemRestrictionGetMinMaxPointsInElement(rstr, min_points, NULL);
+}
+
 /**
   @brief Get the size of the l-vector for a `CeedElemRestriction`
 

From 4ed2b27715919084fc6a9c0e6a05a1e8adc518b8 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 28 Feb 2025 10:24:51 -0700
Subject: [PATCH 307/571] Add min points in element check to tests

---
 tests/t233-elemrestriction.c | 8 +++++++-
 tests/t234-elemrestriction.c | 8 +++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/t233-elemrestriction.c b/tests/t233-elemrestriction.c
index 1973f3b86a..3573e1c349 100644
--- a/tests/t233-elemrestriction.c
+++ b/tests/t233-elemrestriction.c
@@ -48,9 +48,15 @@ int main(int argc, char **argv) {
   }
 
   {
-    CeedInt max_points;
+    CeedInt min_points, max_points;
 
+    CeedElemRestrictionGetMinPointsInElement(elem_restriction, &min_points);
     CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
+    if (min_points != 1 || max_points != num_elem) {
+      // LCOV_EXCL_START
+      printf("Error in min/max points: min %" CeedInt_FMT " max %" CeedInt_FMT "\n", min_points, max_points);
+      // LCOV_EXCL_STOP
+    }
     CeedVectorCreate(ceed, max_points, &y);
   }
 
diff --git a/tests/t234-elemrestriction.c b/tests/t234-elemrestriction.c
index f42c2aed9f..3f434bd365 100644
--- a/tests/t234-elemrestriction.c
+++ b/tests/t234-elemrestriction.c
@@ -35,9 +35,15 @@ int main(int argc, char **argv) {
   CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
   CeedVectorSetValue(x, 0.0);
   {
-    CeedInt max_points;
+    CeedInt min_points, max_points;
 
+    CeedElemRestrictionGetMinPointsInElement(elem_restriction, &min_points);
     CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
+    if (min_points != 1 || max_points != num_elem) {
+      // LCOV_EXCL_START
+      printf("Error in min/max points: min %" CeedInt_FMT " max %" CeedInt_FMT "\n", min_points, max_points);
+      // LCOV_EXCL_STOP
+    }
     CeedVectorCreate(ceed, max_points, &y);
     CeedVectorSetValue(y, 1.0);
   }

From acf3c4b6a8c41b1acd7e7bfed71087418a910350 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Feb 2025 13:34:01 -0700
Subject: [PATCH 308/571] ci - restore IBM testing

---
 .github/workflows/c-fortan-test-ppc64le.yml | 40 +++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 .github/workflows/c-fortan-test-ppc64le.yml

diff --git a/.github/workflows/c-fortan-test-ppc64le.yml b/.github/workflows/c-fortan-test-ppc64le.yml
new file mode 100644
index 0000000000..c1526c1161
--- /dev/null
+++ b/.github/workflows/c-fortan-test-ppc64le.yml
@@ -0,0 +1,40 @@
+name: IBM Power
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-24.04]
+        compiler: [gcc-13]
+        arch: [ppc64le]
+        distro: [ubuntu22.04]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - name: Environment setup
+      uses: actions/checkout@v4
+    - name: Hardware setup and test libCEED
+      uses: uraimo/run-on-arch-action@v3
+      env:
+        CC: ${{ matrix.compiler }}
+        FC: gfortran-13
+      id: runcmd
+      with:
+        arch: ${{ matrix.arch }}
+        distro: ${{ matrix.distro }}
+        run: |
+          apt-get -y update
+          apt-get install -y build-essential
+          apt-get install -y gfortran
+          apt-get install -y python3
+          uname -a
+          make info
+          make -j
+          PROVE_OPTS=-v make prove -j

From d402b6f85c4d9a9e57466cd088bf2e84d90848f4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Feb 2025 13:59:51 -0700
Subject: [PATCH 309/571] ci - minor, tests run slower with -V by default

---
 .github/workflows/c-fortan-test-ppc64le.yml    | 2 +-
 .github/workflows/c-fortran-test-arm64.yml     | 4 ++--
 .github/workflows/c-fortran-test-icc.yml       | 4 ++--
 .github/workflows/c-fortran-test-linux-osx.yml | 4 ++--
 .github/workflows/c-fortran-test-style.yml     | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/c-fortan-test-ppc64le.yml b/.github/workflows/c-fortan-test-ppc64le.yml
index c1526c1161..45d7b4841e 100644
--- a/.github/workflows/c-fortan-test-ppc64le.yml
+++ b/.github/workflows/c-fortan-test-ppc64le.yml
@@ -37,4 +37,4 @@ jobs:
           uname -a
           make info
           make -j
-          PROVE_OPTS=-v make prove -j
+          make prove -j
diff --git a/.github/workflows/c-fortran-test-arm64.yml b/.github/workflows/c-fortran-test-arm64.yml
index fa355ebcf4..d75d11512a 100644
--- a/.github/workflows/c-fortran-test-arm64.yml
+++ b/.github/workflows/c-fortran-test-arm64.yml
@@ -24,5 +24,5 @@ jobs:
         FC: gfortran-13
       run: |
         make info
-        make -j2
-        PROVE_OPTS=-v make prove -j2
+        make -j
+        make prove -j
diff --git a/.github/workflows/c-fortran-test-icc.yml b/.github/workflows/c-fortran-test-icc.yml
index f7101d4e90..4e854195b1 100644
--- a/.github/workflows/c-fortran-test-icc.yml
+++ b/.github/workflows/c-fortran-test-icc.yml
@@ -32,6 +32,6 @@ jobs:
           export CC=icx CXX=icx FC=ifx
           export OPENMP=1
           make info
-          make -j2
-          PROVE_OPTS=-v make prove -j2
+          make -j
+          make prove -j
 
diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index 3b369ce1e4..180eb9ecdc 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -24,5 +24,5 @@ jobs:
         FC: gfortran-13
       run: |
         make info
-        make -j2
-        PROVE_OPTS=-v make prove -j2
+        make -j
+        make prove -j
diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml
index b570221d44..5658082596 100644
--- a/.github/workflows/c-fortran-test-style.yml
+++ b/.github/workflows/c-fortran-test-style.yml
@@ -29,4 +29,4 @@ jobs:
         FC: gfortran-11
       run: |
         make info
-        make format-c -j2 CLANG_FORMAT=clang-format-18 && git diff --exit-code
+        make format-c -j CLANG_FORMAT=clang-format-18 && git diff --exit-code

From 0b96b02dcf565e79672f1c0ce41352c1d7cf48d3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 4 Mar 2025 17:01:29 -0700
Subject: [PATCH 310/571] ex - add ex03 mass+diff

---
 examples/ceed/README.md     |   5 +
 examples/ceed/ex1-volume.c  |  33 ++-
 examples/ceed/ex1-volume.h  |   4 +-
 examples/ceed/ex2-surface.c |  31 ++-
 examples/ceed/ex2-surface.h |  19 +-
 examples/ceed/ex3-volume.c  | 418 ++++++++++++++++++++++++++++++++++++
 examples/ceed/ex3-volume.h  | 151 +++++++++++++
 7 files changed, 648 insertions(+), 13 deletions(-)
 create mode 100644 examples/ceed/ex3-volume.c
 create mode 100644 examples/ceed/ex3-volume.h

diff --git a/examples/ceed/README.md b/examples/ceed/README.md
index 6d4543b2e3..2495f01d0c 100644
--- a/examples/ceed/README.md
+++ b/examples/ceed/README.md
@@ -9,3 +9,8 @@ This example uses the mass matrix to compute the length, area, or volume of a re
 ### Example 2: ex2-surface
 
 This example uses the diffusion matrix to compute the surface area of a region, in 1D, 2D or 3D, depending upon runtime parameters.
+
+### Example 3: ex3-volume
+
+This example uses the mass matrix to compute the length, area, or volume of a region, depending upon runtime parameters.
+Unlike ex1, this example also adds the diffusion matrix to add a zero contribution to this calculation while demonstrating the ability of libCEED to handle multiple basis evaluation modes on the same input and output vectors.
diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c
index c8a12651a6..554c5ab883 100644
--- a/examples/ceed/ex1-volume.c
+++ b/examples/ceed/ex1-volume.c
@@ -117,15 +117,18 @@ int main(int argc, const char *argv[]) {
 
   // Select appropriate backend and logical device based on the (-ceed) command line argument.
   Ceed ceed;
+
   CeedInit(ceed_spec, &ceed);
 
   // Construct the mesh and solution bases.
   CeedBasis mesh_basis, sol_basis;
+
   CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
   CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
 
   // Determine the mesh size based on the given approximate problem size.
   CeedInt num_xyz[dim];
+
   GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
   if (!test) {
     // LCOV_EXCL_START
@@ -139,6 +142,7 @@ int main(int argc, const char *argv[]) {
   // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
   CeedInt             mesh_size, sol_size;
   CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
   BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
   BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, &q_data_restriction);
   if (!test) {
@@ -150,6 +154,7 @@ int main(int argc, const char *argv[]) {
 
   // Create a CeedVector with the mesh coordinates.
   CeedVector mesh_coords;
+
   CeedVectorCreate(ceed, mesh_size, &mesh_coords);
   SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
 
@@ -159,12 +164,14 @@ int main(int argc, const char *argv[]) {
   // Context data to be passed to the 'build_mass' QFunction.
   CeedQFunctionContext build_ctx;
   struct BuildContext  build_ctx_data;
+
   build_ctx_data.dim = build_ctx_data.space_dim = dim;
   CeedQFunctionContextCreate(ceed, &build_ctx);
   CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
 
   // Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data.
   CeedQFunction qf_build;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
     char name[13] = "";
@@ -181,6 +188,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the operator that builds the quadrature data for the mass operator.
   CeedOperator op_build;
+
   CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
   CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
@@ -190,12 +198,14 @@ int main(int argc, const char *argv[]) {
   CeedVector q_data;
   CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
   CeedInt    num_elem  = 1;
+
   for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
   CeedVectorCreate(ceed, num_elem * elem_qpts, &q_data);
   CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
 
   // Create the QFunction that defines the action of the mass operator.
   CeedQFunction qf_apply;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
     CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply);
@@ -209,6 +219,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the mass operator.
   CeedOperator op_apply;
+
   CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
   CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
@@ -216,6 +227,7 @@ int main(int argc, const char *argv[]) {
 
   // Create auxiliary solution-size vectors.
   CeedVector u, v;
+
   CeedVectorCreate(ceed, sol_size, &u);
   CeedVectorCreate(ceed, sol_size, &v);
 
@@ -239,8 +251,10 @@ int main(int argc, const char *argv[]) {
 
   // Compute and print the sum of the entries of 'v' giving the mesh volume.
   CeedScalar volume = 0.;
+
   {
     const CeedScalar *v_array;
+
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i];
     CeedVectorRestoreArrayRead(v, &v_array);
@@ -254,6 +268,7 @@ int main(int argc, const char *argv[]) {
     // LCOV_EXCL_STOP
   } else {
     CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5);
+
     if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume);
   }
 
@@ -281,13 +296,16 @@ int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt
   //    prob_size ~ num_elem * degree^dim
   CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
   CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
   while (num_elem > 1) {
     num_elem /= 2;
     s++;
   }
   CeedInt r = s % dim;
+
   for (CeedInt d = 0; d < dim; d++) {
     CeedInt sd = s / dim;
+
     if (r > 0) {
       sd++;
       r--;
@@ -303,6 +321,7 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
   CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
   CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
   CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     num_elem *= num_xyz[d];
     nd[d] = num_xyz[d] * (p - 1) + 1;
@@ -313,15 +332,19 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
   //           |---*-...-*---|---*-...-*---|- ... -|--...--|
   // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
   CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
   for (CeedInt e = 0; e < num_elem; e++) {
     CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
     for (CeedInt d = 0; d < dim; d++) {
       e_xyz[d] = re % num_xyz[d];
       re /= num_xyz[d];
     }
     CeedInt *local_elem_nodes = elem_nodes + e * num_nodes;
+
     for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
       CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
       for (CeedInt d = 0; d < dim; d++) {
         g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
         g_nodes_stride *= nd[d];
@@ -342,20 +365,25 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
 int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) {
   CeedInt p = mesh_degree + 1;
   CeedInt nd[3], scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     nd[d] = num_xyz[d] * (p - 1) + 1;
     scalar_size *= nd[d];
   }
   CeedScalar *coords;
+
   CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
   CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
   // The H1 basis uses Lobatto quadrature points as nodes.
   CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
   for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
   for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
     CeedInt r_nodes = gs_nodes;
+
     for (CeedInt d = 0; d < dim; d++) {
-      CeedInt d_1d                       = r_nodes % nd[d];
+      CeedInt d_1d = r_nodes % nd[d];
+
       coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d];
       r_nodes /= nd[d];
     }
@@ -373,6 +401,7 @@ int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degre
 CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) {
   CeedScalar  exact_volume;
   CeedScalar *coords;
+
   CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords);
   if (dim == 1) {
     for (CeedInt i = 0; i < mesh_size; i++) {
@@ -382,10 +411,12 @@ CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_c
     exact_volume = 1.;
   } else {
     CeedInt num_nodes = mesh_size / dim;
+
     for (CeedInt i = 0; i < num_nodes; i++) {
       // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
       // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
       CeedScalar u = coords[i], v = coords[i + num_nodes];
+
       u                     = 1. + u;
       v                     = M_PI_2 * v;
       coords[i]             = u * cos(v);
diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h
index 3ec78c4366..4b5d576ccb 100644
--- a/examples/ceed/ex1-volume.h
+++ b/examples/ceed/ex1-volume.h
@@ -46,7 +46,7 @@ CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *
       }  // End of Quadrature Point Loop
       break;
   }
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 /// libCEED Q-function for applying a mass operator
@@ -56,5 +56,5 @@ CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *
 
   // Quadrature Point Loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; }  // End of Quadrature Point Loop
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c
index 5109e36d79..269312698e 100644
--- a/examples/ceed/ex2-surface.c
+++ b/examples/ceed/ex2-surface.c
@@ -121,15 +121,18 @@ int main(int argc, const char *argv[]) {
 
   // Select appropriate backend and logical device based on the (-ceed) command line argument.
   Ceed ceed;
+
   CeedInit(ceed_spec, &ceed);
 
   // Construct the mesh and solution bases.
   CeedBasis mesh_basis, sol_basis;
+
   CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
   CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
 
   // Determine the mesh size based on the given approximate problem size.
   CeedInt num_xyz[3];
+
   GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
 
   if (!test) {
@@ -144,6 +147,7 @@ int main(int argc, const char *argv[]) {
   // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
   CeedInt             mesh_size, sol_size;
   CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
   BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
   BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, dim * (dim + 1) / 2, &sol_size, num_qpts, NULL, &q_data_restriction);
   BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, NULL);
@@ -156,6 +160,7 @@ int main(int argc, const char *argv[]) {
 
   // Create a CeedVector with the mesh coordinates.
   CeedVector mesh_coords;
+
   CeedVectorCreate(ceed, mesh_size, &mesh_coords);
   SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
 
@@ -165,12 +170,14 @@ int main(int argc, const char *argv[]) {
   // Context data to be passed to the 'build_diff' QFunction.
   CeedQFunctionContext build_ctx;
   struct BuildContext  build_ctx_data;
+
   build_ctx_data.dim = build_ctx_data.space_dim = dim;
   CeedQFunctionContextCreate(ceed, &build_ctx);
   CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
 
   // Create the QFunction that builds the diffusion operator (i.e. computes its quadrature data) and set its context data.
   CeedQFunction qf_build;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
     char name[16] = "";
@@ -187,6 +194,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the operator that builds the quadrature data for the diffusion operator.
   CeedOperator op_build;
+
   CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
   CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
@@ -196,12 +204,14 @@ int main(int argc, const char *argv[]) {
   CeedVector q_data;
   CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
   CeedInt    num_elem  = 1;
+
   for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
   CeedVectorCreate(ceed, num_elem * elem_qpts * dim * (dim + 1) / 2, &q_data);
   CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
 
   // Create the QFunction that defines the action of the diffusion operator.
   CeedQFunction qf_apply;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
     char name[25] = "";
@@ -218,6 +228,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the diffusion operator.
   CeedOperator op_apply;
+
   CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
   CeedOperatorSetField(op_apply, "du", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
@@ -225,6 +236,7 @@ int main(int argc, const char *argv[]) {
 
   // Create auxiliary solution-size vectors.
   CeedVector u, v;
+
   CeedVectorCreate(ceed, sol_size, &u);
   CeedVectorCreate(ceed, sol_size, &v);
 
@@ -232,6 +244,7 @@ int main(int argc, const char *argv[]) {
   {
     CeedScalar       *u_array;
     const CeedScalar *x_array;
+
     CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array);
     CeedVectorGetArrayRead(mesh_coords, CEED_MEM_HOST, &x_array);
     for (CeedInt i = 0; i < sol_size; i++) {
@@ -261,6 +274,7 @@ int main(int argc, const char *argv[]) {
   CeedScalar surface_area = 0.;
   {
     const CeedScalar *v_array;
+
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     for (CeedInt i = 0; i < sol_size; i++) surface_area += fabs(v_array[i]);
     CeedVectorRestoreArrayRead(v, &v_array);
@@ -274,6 +288,7 @@ int main(int argc, const char *argv[]) {
     // LCOV_EXCL_STOP
   } else {
     CeedScalar tol = (dim == 1 ? 10000. * CEED_EPSILON : dim == 2 ? 1E-1 : 1E-1);
+
     if (fabs(surface_area - exact_surface_area) > tol) printf("Surface area error         : % .14g\n", surface_area - exact_surface_area);
   }
 
@@ -301,13 +316,16 @@ int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt
   //    prob_size ~ num_elem * degree^dim
   CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
   CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
   while (num_elem > 1) {
     num_elem /= 2;
     s++;
   }
   CeedInt r = s % dim;
+
   for (CeedInt d = 0; d < dim; d++) {
     CeedInt sd = s / dim;
+
     if (r > 0) {
       sd++;
       r--;
@@ -323,6 +341,7 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
   CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
   CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
   CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     num_elem *= num_xyz[d];
     nd[d] = num_xyz[d] * (p - 1) + 1;
@@ -333,15 +352,19 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
   //           |---*-...-*---|---*-...-*---|- ... -|--...--|
   // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
   CeedInt *el_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
   for (CeedInt e = 0; e < num_elem; e++) {
     CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
     for (CeedInt d = 0; d < dim; d++) {
       e_xyz[d] = re % num_xyz[d];
       re /= num_xyz[d];
     }
     CeedInt *local_elem_nodes = el_nodes + e * num_nodes;
+
     for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
       CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
       for (CeedInt d = 0; d < dim; d++) {
         g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
         g_nodes_stride *= nd[d];
@@ -366,20 +389,25 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
 int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, CeedVector mesh_coords) {
   CeedInt p = mesh_degree + 1;
   CeedInt nd[3], scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     nd[d] = num_xyz[d] * (p - 1) + 1;
     scalar_size *= nd[d];
   }
   CeedScalar *coords;
+
   CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
   CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
   // The H1 basis uses Lobatto quadrature points as nodes.
   CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
   for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
   for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
     CeedInt r_nodes = gs_nodes;
+
     for (CeedInt d = 0; d < dim; d++) {
-      CeedInt d1d                        = r_nodes % nd[d];
+      CeedInt d1d = r_nodes % nd[d];
+
       coords[gs_nodes + scalar_size * d] = ((d1d / (p - 1)) + nodes[d1d % (p - 1)]) / num_xyz[d];
       r_nodes /= nd[d];
     }
@@ -403,6 +431,5 @@ CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_c
     coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5));
   }
   CeedVectorRestoreArray(mesh_coords, &coords);
-
   return exact_surface_area;
 }
diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h
index 1355918b70..5a4db76efa 100644
--- a/examples/ceed/ex2-surface.h
+++ b/examples/ceed/ex2-surface.h
@@ -30,27 +30,30 @@ CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *
     case 22:
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // J: 0 2   q_data: 0 2   adj(J):  J22 -J12
-        //    1 3          2 1           -J21  J11
+        //    1 3           2 1           -J21  J11
         const CeedScalar J11 = J[i + Q * 0];
         const CeedScalar J21 = J[i + Q * 1];
         const CeedScalar J12 = J[i + Q * 2];
         const CeedScalar J22 = J[i + Q * 3];
         const CeedScalar qw  = w[i] / (J11 * J22 - J21 * J12);
-        q_data[i + Q * 0]    = qw * (J12 * J12 + J22 * J22);
-        q_data[i + Q * 1]    = qw * (J11 * J11 + J21 * J21);
-        q_data[i + Q * 2]    = -qw * (J11 * J12 + J21 * J22);
+
+        q_data[i + Q * 0] = qw * (J12 * J12 + J22 * J22);
+        q_data[i + Q * 1] = qw * (J11 * J11 + J21 * J21);
+        q_data[i + Q * 2] = -qw * (J11 * J12 + J21 * J22);
       }  // End of Quadrature Point Loop
       break;
     case 33:
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Compute the adjoint
         CeedScalar A[3][3];
-        for (CeedInt j = 0; j < 3; j++)
-          for (CeedInt k = 0; k < 3; k++)
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
             // Equivalent code with J as a VLA and no mod operations:
             // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
             A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] -
                       J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+          }
+        }
 
         // Compute quadrature weight / det(J)
         const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
@@ -69,7 +72,7 @@ CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *
       }  // End of Quadrature Point Loop
       break;
   }
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 /// libCEED Q-function for applying a diff operator
@@ -120,5 +123,5 @@ CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *
       }  // End of Quadrature Point Loop
       break;
   }
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
diff --git a/examples/ceed/ex3-volume.c b/examples/ceed/ex3-volume.c
new file mode 100644
index 0000000000..5acaaa7c4a
--- /dev/null
+++ b/examples/ceed/ex3-volume.c
@@ -0,0 +1,418 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator.
+// This example also uses a diffusion operator, which provides zero contribution to the computed volume but demonstrates libCEED's ability
+// to handle multiple basis evaluation modes for the same input and output vectors.
+// Arbitrary mesh and solution degrees in 1D, 2D and 3D are supported from the same code.
+//
+// The example has no dependencies, and is designed to be self-contained.
+// For additional examples that use external discretization libraries (MFEM, PETSc, etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed).
+//
+// Build with:
+//
+//     make ex1-volume [CEED_DIR=</path/to/libceed>]
+//
+// Sample runs:
+//
+//     ./ex3-volume
+//     ./ex3-volume -ceed /cpu/self
+//     ./ex3-volume -ceed /gpu/cuda
+//
+// Test in 1D-3D
+//TESTARGS(name="1D User QFunction") -ceed {ceed_resource} -d 1 -t
+//TESTARGS(name="2D User QFunction") -ceed {ceed_resource} -d 2 -t
+//TESTARGS(name="3D User QFunction") -ceed {ceed_resource} -d 3 -t
+
+/// @file
+/// libCEED example using mass operator to compute volume
+
+#include "ex3-volume.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Auxiliary functions
+int        GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]);
+int        BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                                     CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction);
+int        SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords);
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords);
+
+// Main example
+int main(int argc, const char *argv[]) {
+  const char *ceed_spec   = "/cpu/self";
+  CeedInt     dim         = 3;               // dimension of the mesh
+  CeedInt     num_comp_x  = 3;               // number of x components
+  CeedInt     mesh_degree = 4;               // polynomial degree for the mesh
+  CeedInt     sol_degree  = 4;               // polynomial degree for the solution
+  CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
+  CeedInt     prob_size   = -1;              // approximate problem size
+  CeedInt     help = 0, test = 0, benchmark = 0;
+
+  // Process command line arguments.
+  for (int ia = 1; ia < argc; ia++) {
+    // LCOV_EXCL_START
+    int next_arg = ((ia + 1) < argc), parse_error = 0;
+    if (!strcmp(argv[ia], "-h")) {
+      help = 1;
+    } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) {
+      parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1;
+    } else if (!strcmp(argv[ia], "-d")) {
+      parse_error = next_arg ? dim = atoi(argv[++ia]), 0 : 1;
+      num_comp_x                   = dim;
+    } else if (!strcmp(argv[ia], "-m")) {
+      parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-p")) {
+      parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-q")) {
+      parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-s")) {
+      parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-t")) {
+      test = 1;
+    }
+    if (parse_error) {
+      printf("Error parsing command line options.\n");
+      return 1;
+    }
+    // LCOV_EXCL_STOP
+  }
+  if (prob_size < 0) prob_size = test ? 8 * 16 : 256 * 1024;
+
+  // Print the values of all options:
+  if (!test || help) {
+    // LCOV_EXCL_START
+    printf("Selected options: [command line option] : <current value>\n");
+    printf("  Ceed specification     [-c] : %s\n", ceed_spec);
+    printf("  Mesh dimension         [-d] : %" CeedInt_FMT "\n", dim);
+    printf("  Mesh degree            [-m] : %" CeedInt_FMT "\n", mesh_degree);
+    printf("  Solution degree        [-p] : %" CeedInt_FMT "\n", sol_degree);
+    printf("  Num. 1D quadrature pts [-q] : %" CeedInt_FMT "\n", num_qpts);
+    printf("  Approx. # unknowns     [-s] : %" CeedInt_FMT "\n", prob_size);
+    printf("  QFunction source            : header");
+    if (help) {
+      printf("Test/quiet mode is %s\n", (test ? "ON" : "OFF (use -t to enable)"));
+      return 0;
+    }
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Select appropriate backend and logical device based on the (-ceed) command line argument.
+  Ceed ceed;
+
+  CeedInit(ceed_spec, &ceed);
+
+  // Construct the mesh and solution bases.
+  CeedBasis mesh_basis, sol_basis;
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
+
+  // Determine the mesh size based on the given approximate problem size.
+  CeedInt num_xyz[dim];
+
+  GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]);
+    if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]);
+    if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]);
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
+  CeedInt             mesh_size, sol_size;
+  CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
+  BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
+  BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1 + dim * (dim + 1) / 2, &sol_size, num_qpts, NULL, &q_data_restriction);
+  BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, NULL);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Number of mesh nodes     : %" CeedInt_FMT "\n", mesh_size / dim);
+    printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size);
+    // LCOV_EXCL_STOP
+  }
+
+  // Create a CeedVector with the mesh coordinates.
+  CeedVector mesh_coords;
+
+  CeedVectorCreate(ceed, mesh_size, &mesh_coords);
+  SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
+
+  // Apply a transformation to the mesh.
+  CeedScalar exact_volume = TransformMeshCoords(dim, mesh_size, mesh_coords);
+
+  // Context data to be passed to the 'build_mass_diff' QFunction.
+  CeedQFunctionContext build_ctx;
+  struct BuildContext  build_ctx_data;
+
+  build_ctx_data.dim = build_ctx_data.space_dim = dim;
+  CeedQFunctionContextCreate(ceed, &build_ctx);
+  CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
+
+  // Create the QFunction that builds the mass + diffusion operator (i.e. computes its quadrature data) and set its context data.
+  CeedQFunction qf_build;
+
+  CeedQFunctionCreateInterior(ceed, 1, build_mass_diff, build_mass_diff_loc, &qf_build);
+  CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddOutput(qf_build, "qdata", 1 + dim * (dim + 1) / 2, CEED_EVAL_NONE);
+  CeedQFunctionSetContext(qf_build, build_ctx);
+
+  // Create the operator that builds the quadrature data for the mass + diffusion operator.
+  CeedOperator op_build;
+
+  CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
+  CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+  // Compute the quadrature data for the mass + diffusion operator.
+  CeedVector q_data;
+  CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
+  CeedInt    num_elem  = 1;
+
+  for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
+  CeedVectorCreate(ceed, num_elem * elem_qpts * (1 + dim * (dim + 1) / 2), &q_data);
+  CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Create the QFunction that defines the action of the mass + diffusion operator.
+  CeedQFunction qf_apply;
+
+  CeedQFunctionCreateInterior(ceed, 1, apply_mass_diff, apply_mass_diff_loc, &qf_apply);
+  CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_apply, "du", dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddInput(qf_apply, "qdata", 1 + dim * (dim + 1) / 2, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddOutput(qf_apply, "dv", dim, CEED_EVAL_GRAD);
+  CeedQFunctionSetContext(qf_apply, build_ctx);
+
+  // Create the mass +diffusion operator.
+  CeedOperator op_apply;
+
+  CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
+  CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "du", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "dv", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+
+  // Create auxiliary solution-size vectors.
+  CeedVector u, v;
+
+  CeedVectorCreate(ceed, sol_size, &u);
+  CeedVectorCreate(ceed, sol_size, &v);
+
+  // Initialize 'u' with ones.
+  CeedVectorSetValue(u, 1.0);
+
+  // Compute the mesh volume using the mass + diffusion operator: volume = 1^T \cdot M \cdot 1
+  CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
+  // Compute and print the sum of the entries of 'v' giving the mesh volume.
+  CeedScalar volume = 0.;
+
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+  if (!test) {
+    // LCOV_EXCL_START
+    printf(" done.\n");
+    printf("Exact mesh volume    : % .14g\n", exact_volume);
+    printf("Computed mesh volume : % .14g\n", volume);
+    printf("Volume error         : % .14g\n", volume - exact_volume);
+    // LCOV_EXCL_STOP
+  } else {
+    CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5);
+
+    if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume);
+  }
+
+  // Free dynamically allocated memory.
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&mesh_coords);
+  CeedOperatorDestroy(&op_apply);
+  CeedQFunctionDestroy(&qf_apply);
+  CeedQFunctionContextDestroy(&build_ctx);
+  CeedOperatorDestroy(&op_build);
+  CeedQFunctionDestroy(&qf_build);
+  CeedElemRestrictionDestroy(&sol_restriction);
+  CeedElemRestrictionDestroy(&mesh_restriction);
+  CeedElemRestrictionDestroy(&q_data_restriction);
+  CeedBasisDestroy(&sol_basis);
+  CeedBasisDestroy(&mesh_basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
+
+int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]) {
+  // Use the approximate formula:
+  //    prob_size ~ num_elem * degree^dim
+  CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
+  CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
+  while (num_elem > 1) {
+    num_elem /= 2;
+    s++;
+  }
+  CeedInt r = s % dim;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    CeedInt sd = s / dim;
+
+    if (r > 0) {
+      sd++;
+      r--;
+    }
+    num_xyz[d] = 1 << sd;
+  }
+  return 0;
+}
+
+int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                              CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction) {
+  CeedInt p         = degree + 1;
+  CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
+  CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
+  CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    num_elem *= num_xyz[d];
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  *size = scalar_size * num_comp;
+  // elem:         0             1                 n-1
+  //           |---*-...-*---|---*-...-*---|- ... -|--...--|
+  // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
+  CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      e_xyz[d] = re % num_xyz[d];
+      re /= num_xyz[d];
+    }
+    CeedInt *local_elem_nodes = elem_nodes + e * num_nodes;
+
+    for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
+      CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
+      for (CeedInt d = 0; d < dim; d++) {
+        g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
+        g_nodes_stride *= nd[d];
+        r_nodes /= p;
+      }
+      local_elem_nodes[l_nodes] = g_nodes;
+    }
+  }
+  if (restriction) {
+    CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes,
+                              restriction);
+  }
+  if (q_data_restriction) {
+    CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction);
+  }
+  free(elem_nodes);
+  return 0;
+}
+
+int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) {
+  CeedInt p = mesh_degree + 1;
+  CeedInt nd[3], scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  CeedScalar *coords;
+
+  CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
+  CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
+  // The H1 basis uses Lobatto quadrature points as nodes.
+  CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
+  for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
+  for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
+    CeedInt r_nodes = gs_nodes;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      CeedInt d_1d                       = r_nodes % nd[d];
+      coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d];
+      r_nodes /= nd[d];
+    }
+  }
+  free(nodes);
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return 0;
+}
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#define M_PI_2 1.57079632679489661923
+#endif
+
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) {
+  CeedScalar  exact_volume;
+  CeedScalar *coords;
+
+  CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords);
+  if (dim == 1) {
+    for (CeedInt i = 0; i < mesh_size; i++) {
+      // map [0,1] to [0,1] varying the mesh density
+      coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5));
+    }
+    exact_volume = 1.;
+  } else {
+    CeedInt num_nodes = mesh_size / dim;
+    for (CeedInt i = 0; i < num_nodes; i++) {
+      // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+      // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+      CeedScalar u = coords[i], v = coords[i + num_nodes];
+
+      u                     = 1. + u;
+      v                     = M_PI_2 * v;
+      coords[i]             = u * cos(v);
+      coords[i + num_nodes] = u * sin(v);
+    }
+    exact_volume = 3. / 4. * M_PI;
+  }
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return exact_volume;
+}
diff --git a/examples/ceed/ex3-volume.h b/examples/ceed/ex3-volume.h
new file mode 100644
index 0000000000..01086ecaf2
--- /dev/null
+++ b/examples/ceed/ex3-volume.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+/// A structure used to pass additional data to f_build_mass_diff
+struct BuildContext {
+  CeedInt dim, space_dim;
+};
+
+/// libCEED Q-function for building quadrature data for a mass + diffusion operator
+CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+  // in[0] is Jacobians with shape [dim, nc=dim, Q]
+  // in[1] is quadrature weights, size (Q)
+  //
+  // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+  const CeedScalar *J = in[0], *w = in[1];
+  CeedScalar       *q_data = out[0];
+
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        q_data[i + Q * 0] = w[i] * J[i];
+        // Diffusion
+        q_data[i + Q * 1] = w[i] / J[i];
+      }  // End of Quadrature Point Loop
+      break;
+    case 22:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // J: 0 2   q_data: 1 3   adj(J):  J22 -J12
+        //    1 3           3 2           -J21  J11
+        const CeedScalar J11 = J[i + Q * 0];
+        const CeedScalar J21 = J[i + Q * 1];
+        const CeedScalar J12 = J[i + Q * 2];
+        const CeedScalar J22 = J[i + Q * 3];
+        const CeedScalar qw  = w[i] / (J11 * J22 - J21 * J12);
+
+        // Mass
+        q_data[i + Q * 0] = w[i] * (J11 * J22 - J21 * J12);
+        // Diffusion
+        q_data[i + Q * 1] = qw * (J12 * J12 + J22 * J22);
+        q_data[i + Q * 2] = qw * (J11 * J11 + J21 * J21);
+        q_data[i + Q * 3] = -qw * (J11 * J12 + J21 * J22);
+      }  // End of Quadrature Point Loop
+      break;
+    case 33:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Compute the adjoint
+        CeedScalar A[3][3];
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
+            // Equivalent code with J as a VLA and no mod operations:
+            // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
+            A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] -
+                      J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+          }
+        }
+
+        // Compute quadrature weight / det(J)
+        const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
+
+        // Mass
+        q_data[i + Q * 0] = w[i] * (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
+        // Diffusion
+        // Stored in Voigt convention
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
+        q_data[i + Q * 1] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[i + Q * 2] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[i + Q * 3] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[i + Q * 4] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[i + Q * 5] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[i + Q * 6] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+      }  // End of Quadrature Point Loop
+      break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass + diffusion operator
+CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+  // in[0], out[0] have shape [1,   nc=1, Q]
+  // in[1], out[1] have shape [dim, nc=1, Q]
+  const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2];
+  CeedScalar       *v = out[0], *vg = out[1];
+
+  switch (build_data->dim) {
+    case 1:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[i + Q * 0] * u[i];
+        // Diffusion
+        vg[i] = ug[i] * q_data[i + Q * 1];
+      }  // End of Quadrature Point Loop
+      break;
+    case 2:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[i + Q * 0] * u[i];
+
+        // Diffusion
+        // Read spatial derivatives of u
+        const CeedScalar du[2] = {ug[i + Q * 0], ug[i + Q * 1]};
+
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 1 3
+        // 3 2
+        const CeedScalar dXdxdXdx_T[2][2] = {
+            {q_data[i + 1 * Q], q_data[i + 3 * Q]},
+            {q_data[i + 3 * Q], q_data[i + 2 * Q]}
+        };
+        // j = direction of vg
+        for (int j = 0; j < 2; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j]);
+      }  // End of Quadrature Point Loop
+      break;
+    case 3:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[i + Q * 0] * u[i];
+
+        // Diffusion
+        // Read spatial derivatives of u
+        const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]};
+
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 0 5 4
+        // 5 1 3
+        // 4 3 2
+        const CeedScalar dXdxdXdx_T[3][3] = {
+            {q_data[i + 1 * Q], q_data[i + 6 * Q], q_data[i + 5 * Q]},
+            {q_data[i + 6 * Q], q_data[i + 2 * Q], q_data[i + 4 * Q]},
+            {q_data[i + 5 * Q], q_data[i + 4 * Q], q_data[i + 3 * Q]}
+        };
+        // j = direction of vg
+        for (int j = 0; j < 3; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]);
+      }  // End of Quadrature Point Loop
+      break;
+  }
+  return CEED_ERROR_SUCCESS;
+}

From 2849f54c6d85a8b2e17c2a5bb5551e8ae280f68a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 4 Mar 2025 17:04:16 -0700
Subject: [PATCH 311/571] ci - only test t5, ex on IBM Power

---
 .github/workflows/c-fortan-test-ppc64le.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/c-fortan-test-ppc64le.yml b/.github/workflows/c-fortan-test-ppc64le.yml
index 45d7b4841e..b959d929d5 100644
--- a/.github/workflows/c-fortan-test-ppc64le.yml
+++ b/.github/workflows/c-fortan-test-ppc64le.yml
@@ -37,4 +37,4 @@ jobs:
           uname -a
           make info
           make -j
-          make prove -j
+          make prove -j search="t5 ex"

From 0806d17a7359838f02ea4f382dfd8ee0afba251f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 5 Mar 2025 10:20:30 -0700
Subject: [PATCH 312/571] ex3 - fix typo in sample call

Co-authored-by: James Wright <james@jameswright.xyz>
---
 examples/ceed/ex3-volume.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ceed/ex3-volume.c b/examples/ceed/ex3-volume.c
index 5acaaa7c4a..1bee79686e 100644
--- a/examples/ceed/ex3-volume.c
+++ b/examples/ceed/ex3-volume.c
@@ -19,7 +19,7 @@
 //
 // Build with:
 //
-//     make ex1-volume [CEED_DIR=</path/to/libceed>]
+//     make ex3-volume [CEED_DIR=</path/to/libceed>]
 //
 // Sample runs:
 //

From 0a242873a29183ba8fb2a56a57a36b831ce52f87 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 5 Mar 2025 11:29:43 -0700
Subject: [PATCH 313/571] ex - use VLA casts to clarify qf data shapes

---
 examples/ceed/ex1-volume.h  |  45 ++++++-----
 examples/ceed/ex2-surface.h | 123 ++++++++++++++++-------------
 examples/ceed/ex3-volume.h  | 151 ++++++++++++++++++++----------------
 3 files changed, 176 insertions(+), 143 deletions(-)

diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h
index 4b5d576ccb..98d1315bf3 100644
--- a/examples/ceed/ex1-volume.h
+++ b/examples/ceed/ex1-volume.h
@@ -14,43 +14,46 @@ struct BuildContext {
 
 /// libCEED Q-function for building quadrature data for a mass operator
 CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // in[0] is Jacobians with shape [dim, nc=dim, Q]
-  // in[1] is quadrature weights, size (Q)
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights with shape [1, Q]
+  const CeedScalar    *w          = in[1];
+  CeedScalar          *q_data     = out[0];
   struct BuildContext *build_data = (struct BuildContext *)ctx;
-  const CeedScalar    *J = in[0], *w = in[1];
-  CeedScalar          *q_data = out[0];
 
   switch (build_data->dim + 10 * build_data->space_dim) {
-    case 11:
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
       // Quadrature Point Loop
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[i] * w[i]; }  // End of Quadrature Point Loop
-      break;
-    case 22:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
       // Quadrature Point Loop
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // 0 2
-        // 1 3
-        q_data[i] = (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]) * w[i];
+        q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
       }  // End of Quadrature Point Loop
-      break;
-    case 33:
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
       // Quadrature Point Loop
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // 0 3 6
-        // 1 4 7
-        // 2 5 8
-        q_data[i] = (J[i + Q * 0] * (J[i + Q * 4] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 7]) -
-                     J[i + Q * 1] * (J[i + Q * 3] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 6]) +
-                     J[i + Q * 2] * (J[i + Q * 3] * J[i + Q * 7] - J[i + Q * 4] * J[i + Q * 6])) *
-                    w[i];
+        q_data[i] =
+            (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) +
+             J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
+            w[i];
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
   return CEED_ERROR_SUCCESS;
 }
 
 /// libCEED Q-function for applying a mass operator
 CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0], out[0] are solution variables with shape [1, Q]
+  // in[1] is quadrature data with shape [1, Q]
   const CeedScalar *u = in[0], *q_data = in[1];
   CeedScalar       *v = out[0];
 
diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h
index 5a4db76efa..d11ee6ab5f 100644
--- a/examples/ceed/ex2-surface.h
+++ b/examples/ceed/ex2-surface.h
@@ -14,63 +14,69 @@ struct BuildContext {
 
 /// libCEED Q-function for building quadrature data for a diffusion operator
 CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  struct BuildContext *build_data = (struct BuildContext *)ctx;
-  // in[0] is Jacobians with shape [dim, nc=dim, Q]
+  // in[0] is Jacobians with shape [dim, dim, Q]
   // in[1] is quadrature weights, size (Q)
-  //
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
   // the symmetric part of the result.
-  const CeedScalar *J = in[0], *w = in[1];
-  CeedScalar       *q_data = out[0];
-
   switch (build_data->dim + 10 * build_data->space_dim) {
-    case 11:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = w[i] / J[i]; }  // End of Quadrature Point Loop
-      break;
-    case 22:
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[0][i] = w[i] / J[0][0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // J: 0 2   q_data: 0 2   adj(J):  J22 -J12
-        //    1 3           2 1           -J21  J11
-        const CeedScalar J11 = J[i + Q * 0];
-        const CeedScalar J21 = J[i + Q * 1];
-        const CeedScalar J12 = J[i + Q * 2];
-        const CeedScalar J22 = J[i + Q * 3];
-        const CeedScalar qw  = w[i] / (J11 * J22 - J21 * J12);
-
-        q_data[i + Q * 0] = qw * (J12 * J12 + J22 * J22);
-        q_data[i + Q * 1] = qw * (J11 * J11 + J21 * J21);
-        q_data[i + Q * 2] = -qw * (J11 * J12 + J21 * J22);
+        // J: 0 2   q_data: 0 2   adj(J):  J11 -J01
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
+
+        q_data[0][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[1][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[2][i] = -qw * (J00 * J01 + J10 * J11);
       }  // End of Quadrature Point Loop
-      break;
-    case 33:
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Compute the adjoint
         CeedScalar A[3][3];
+
         for (CeedInt j = 0; j < 3; j++) {
           for (CeedInt k = 0; k < 3; k++) {
             // Equivalent code with J as a VLA and no mod operations:
             // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
-            A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] -
-                      J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
           }
         }
 
         // Compute quadrature weight / det(J)
-        const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
 
         // Compute geometric factors
         // Stored in Voigt convention
         // 0 5 4
         // 5 1 3
         // 4 3 2
-        q_data[i + Q * 0] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
-        q_data[i + Q * 1] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
-        q_data[i + Q * 2] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
-        q_data[i + Q * 3] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
-        q_data[i + Q * 4] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
-        q_data[i + Q * 5] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+        q_data[0][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[1][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[2][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[3][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -78,50 +84,55 @@ CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *
 /// libCEED Q-function for applying a diff operator
 CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   struct BuildContext *build_data = (struct BuildContext *)ctx;
-  // in[0], out[0] have shape [dim, nc=1, Q]
-  const CeedScalar *ug = in[0], *q_data = in[1];
-  CeedScalar       *vg = out[0];
+  // in[0], out[0] solution gradients with shape [dim, 1, Q]
+  // in[1] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
   switch (build_data->dim) {
-    case 1:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[i]; }  // End of Quadrature Point Loop
-      break;
-    case 2:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // Read spatial derivatives of u
-        const CeedScalar du[2] = {ug[i + Q * 0], ug[i + Q * 1]};
+    case 1: {
+      const CeedScalar *ug = in[0];
+      CeedScalar       *vg = out[0];
 
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 2: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Read q_data (dXdxdXdx_T symmetric matrix)
         // Stored in Voigt convention
         // 0 2
         // 2 1
         const CeedScalar dXdxdXdx_T[2][2] = {
-            {q_data[i + 0 * Q], q_data[i + 2 * Q]},
-            {q_data[i + 2 * Q], q_data[i + 1 * Q]}
+            {q_data[0][i], q_data[2][i]},
+            {q_data[2][i], q_data[1][i]}
         };
+
         // j = direction of vg
-        for (int j = 0; j < 2; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j]);
+        for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
       }  // End of Quadrature Point Loop
-      break;
-    case 3:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // Read spatial derivatives of u
-        const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]};
+    } break;
+    case 3: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Read q_data (dXdxdXdx_T symmetric matrix)
         // Stored in Voigt convention
         // 0 5 4
         // 5 1 3
         // 4 3 2
         const CeedScalar dXdxdXdx_T[3][3] = {
-            {q_data[i + 0 * Q], q_data[i + 5 * Q], q_data[i + 4 * Q]},
-            {q_data[i + 5 * Q], q_data[i + 1 * Q], q_data[i + 3 * Q]},
-            {q_data[i + 4 * Q], q_data[i + 3 * Q], q_data[i + 2 * Q]}
+            {q_data[0][i], q_data[5][i], q_data[4][i]},
+            {q_data[5][i], q_data[1][i], q_data[3][i]},
+            {q_data[4][i], q_data[3][i], q_data[2][i]}
         };
+
         // j = direction of vg
-        for (int j = 0; j < 3; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]);
+        for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
   return CEED_ERROR_SUCCESS;
 }
diff --git a/examples/ceed/ex3-volume.h b/examples/ceed/ex3-volume.h
index 01086ecaf2..d544d229c1 100644
--- a/examples/ceed/ex3-volume.h
+++ b/examples/ceed/ex3-volume.h
@@ -14,73 +14,82 @@ struct BuildContext {
 
 /// libCEED Q-function for building quadrature data for a mass + diffusion operator
 CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  struct BuildContext *build_data = (struct BuildContext *)ctx;
-  // in[0] is Jacobians with shape [dim, nc=dim, Q]
+  // in[0] is Jacobians with shape [dim, dim, Q]
   // in[1] is quadrature weights, size (Q)
-  //
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
   // the symmetric part of the result.
-  const CeedScalar *J = in[0], *w = in[1];
-  CeedScalar       *q_data = out[0];
-
   switch (build_data->dim + 10 * build_data->space_dim) {
-    case 11:
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Mass
-        q_data[i + Q * 0] = w[i] * J[i];
+        q_data[0][i] = w[i] * J[0][0][i];
+
         // Diffusion
-        q_data[i + Q * 1] = w[i] / J[i];
+        q_data[1][i] = w[i] / J[0][0][i];
       }  // End of Quadrature Point Loop
-      break;
-    case 22:
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // J: 0 2   q_data: 1 3   adj(J):  J22 -J12
-        //    1 3           3 2           -J21  J11
-        const CeedScalar J11 = J[i + Q * 0];
-        const CeedScalar J21 = J[i + Q * 1];
-        const CeedScalar J12 = J[i + Q * 2];
-        const CeedScalar J22 = J[i + Q * 3];
-        const CeedScalar qw  = w[i] / (J11 * J22 - J21 * J12);
+        // J: 0 2   q_data: 0 2   adj(J):  J22 -J12
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
 
         // Mass
-        q_data[i + Q * 0] = w[i] * (J11 * J22 - J21 * J12);
+        q_data[0][i] = w[i] * (J00 * J11 - J10 * J01);
+
         // Diffusion
-        q_data[i + Q * 1] = qw * (J12 * J12 + J22 * J22);
-        q_data[i + Q * 2] = qw * (J11 * J11 + J21 * J21);
-        q_data[i + Q * 3] = -qw * (J11 * J12 + J21 * J22);
+        q_data[1][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[2][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[3][i] = -qw * (J00 * J01 + J10 * J11);
       }  // End of Quadrature Point Loop
-      break;
-    case 33:
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Compute the adjoint
         CeedScalar A[3][3];
+
         for (CeedInt j = 0; j < 3; j++) {
           for (CeedInt k = 0; k < 3; k++) {
             // Equivalent code with J as a VLA and no mod operations:
             // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
-            A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] -
-                      J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
           }
         }
 
         // Compute quadrature weight / det(J)
-        const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
 
         // Mass
-        q_data[i + Q * 0] = w[i] * (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
+        q_data[0][i] = w[i] * (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
         // Diffusion
         // Stored in Voigt convention
         // 1 6 5
         // 6 2 4
         // 5 4 3
-        q_data[i + Q * 1] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
-        q_data[i + Q * 2] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
-        q_data[i + Q * 3] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
-        q_data[i + Q * 4] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
-        q_data[i + Q * 5] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
-        q_data[i + Q * 6] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+        q_data[1][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[2][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[3][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[4][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[6][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -88,64 +97,74 @@ CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *co
 /// libCEED Q-function for applying a mass + diffusion operator
 CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   struct BuildContext *build_data = (struct BuildContext *)ctx;
-  // in[0], out[0] have shape [1,   nc=1, Q]
-  // in[1], out[1] have shape [dim, nc=1, Q]
-  const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2];
-  CeedScalar       *v = out[0], *vg = out[1];
+  // in[1], out[1] solution values with shape [1, 1, Q]
+  // in[1], out[1] solution gradients with shape [dim, 1, Q]
+  // in[2] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
 
   switch (build_data->dim) {
-    case 1:
+    case 1: {
+      const CeedScalar *u = in[0], *ug = in[1];
+      CeedScalar       *v = out[0], *vg = out[1];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Mass
-        v[i] = q_data[i + Q * 0] * u[i];
+        v[i] = q_data[0][i] * u[i];
+
         // Diffusion
-        vg[i] = ug[i] * q_data[i + Q * 1];
+        vg[i] = q_data[1][i] * ug[i];
       }  // End of Quadrature Point Loop
-      break;
-    case 2:
+    } break;
+    case 2: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Mass
-        v[i] = q_data[i + Q * 0] * u[i];
+        v[i] = q_data[0][i] * u[i];
 
         // Diffusion
-        // Read spatial derivatives of u
-        const CeedScalar du[2] = {ug[i + Q * 0], ug[i + Q * 1]};
-
         // Read q_data (dXdxdXdx_T symmetric matrix)
         // Stored in Voigt convention
         // 1 3
-        // 3 2
+        // 23 2
         const CeedScalar dXdxdXdx_T[2][2] = {
-            {q_data[i + 1 * Q], q_data[i + 3 * Q]},
-            {q_data[i + 3 * Q], q_data[i + 2 * Q]}
+            {q_data[1][i], q_data[3][i]},
+            {q_data[3][i], q_data[2][i]}
         };
+
         // j = direction of vg
-        for (int j = 0; j < 2; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j]);
+        for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
       }  // End of Quadrature Point Loop
-      break;
-    case 3:
+    } break;
+    case 3: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Mass
-        v[i] = q_data[i + Q * 0] * u[i];
+        v[i] = q_data[0][i] * u[i];
 
         // Diffusion
-        // Read spatial derivatives of u
-        const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]};
-
         // Read q_data (dXdxdXdx_T symmetric matrix)
         // Stored in Voigt convention
-        // 0 5 4
-        // 5 1 3
-        // 4 3 2
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
         const CeedScalar dXdxdXdx_T[3][3] = {
-            {q_data[i + 1 * Q], q_data[i + 6 * Q], q_data[i + 5 * Q]},
-            {q_data[i + 6 * Q], q_data[i + 2 * Q], q_data[i + 4 * Q]},
-            {q_data[i + 5 * Q], q_data[i + 4 * Q], q_data[i + 3 * Q]}
+            {q_data[1][i], q_data[6][i], q_data[5][i]},
+            {q_data[6][i], q_data[2][i], q_data[4][i]},
+            {q_data[5][i], q_data[4][i], q_data[3][i]}
         };
+
         // j = direction of vg
-        for (int j = 0; j < 3; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]);
+        for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
   return CEED_ERROR_SUCCESS;
 }

From 8baf801d73b4af53c6a6591441c9a2d5a4ea04b6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 5 Mar 2025 14:07:48 -0700
Subject: [PATCH 314/571] make - add some headings grouping commands

---
 Makefile | 385 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 250 insertions(+), 135 deletions(-)

diff --git a/Makefile b/Makefile
index c9f4dd7cc1..c0df25d13f 100644
--- a/Makefile
+++ b/Makefile
@@ -5,11 +5,60 @@
 #
 # This file is part of CEED:  http://github.com/ceed
 
+# ------------------------------------------------------------
+# Configuration
+# ------------------------------------------------------------
+
+# config.mk stores cached configuration variables
 CONFIG ?= config.mk
 -include $(CONFIG)
+
+# common.mk holds definitions used in various makefiles throughout the project
 COMMON ?= common.mk
 -include $(COMMON)
 
+# Quiet, color output
+quiet ?= $($(1))
+
+# Cancel built-in and old-fashioned implicit rules which we don't use
+.SUFFIXES:
+
+.SECONDEXPANSION: # to expand $$(@D)/.DIR
+
+%/.DIR :
+	@mkdir -p $(@D)
+	@touch $@
+
+.PRECIOUS: %/.DIR
+
+
+# ------------------------------------------------------------
+# Root directories for backend dependencies
+# ------------------------------------------------------------
+
+# XSMM_DIR env variable should point to XSMM main (github.com/hfp/libxsmm)
+XSMM_DIR ?= ../libxsmm
+
+# Often /opt/cuda or /usr/local/cuda, but sometimes present on machines that don't support CUDA
+CUDA_DIR  ?=
+CUDA_ARCH ?=
+
+# Often /opt/rocm, but sometimes present on machines that don't support HIP
+ROCM_DIR ?=
+HIP_ARCH ?=
+
+# env variable MAGMA_DIR can be used too
+MAGMA_DIR ?= ../magma
+
+# OCCA_DIR env variable should point to OCCA main (github.com/libocca/occa)
+OCCA_DIR ?= ../occa/install
+
+
+# ------------------------------------------------------------
+# Compiler flags
+# ------------------------------------------------------------
+
+# Detect user compiler options and set defaults
 ifeq (,$(filter-out undefined default,$(origin CC)))
   CC = gcc
 endif
@@ -47,56 +96,13 @@ ASAN ?=
 # if any. If the user sets CEED_LDFLAGS or CEED_LDLIBS, they are used *instead
 # of* what we populate here (thus that's advanced usage and not recommended).
 CEED_LDFLAGS ?=
-CEED_LDLIBS ?=
+CEED_LDLIBS  ?=
 
 UNDERSCORE ?= 1
 
 # Verbose mode, V or VERBOSE
 V ?= $(VERBOSE)
 
-# MFEM_DIR env variable should point to sibling directory
-ifneq ($(wildcard ../mfem/libmfem.*),)
-  MFEM_DIR ?= ../mfem
-endif
-
-# NEK5K_DIR env variable should point to sibling directory
-ifneq ($(wildcard ../Nek5000/*),)
-  NEK5K_DIR ?= $(abspath ../Nek5000)
-endif
-export NEK5K_DIR
-MPI ?= 1
-
-# DEAL_II_DIR env variable should point to sibling directory
-ifneq ($(wildcard ../dealii/install/lib/libdeal_II.*),)
-  DEAL_II_DIR ?= ../dealii/install
-endif
-export DEAL_II_DIR
-
-# CEED_DIR env for NEK5K testing
-export CEED_DIR = $(abspath .)
-
-# XSMM_DIR env variable should point to XSMM main (github.com/hfp/libxsmm)
-XSMM_DIR ?= ../libxsmm
-
-# OCCA_DIR env variable should point to OCCA main (github.com/libocca/occa)
-OCCA_DIR ?= ../occa/install
-
-# env variable MAGMA_DIR can be used too
-MAGMA_DIR ?= ../magma
-
-# Often /opt/cuda or /usr/local/cuda, but sometimes present on machines that don't support CUDA
-CUDA_DIR  ?=
-CUDA_ARCH ?=
-
-# Often /opt/rocm, but sometimes present on machines that don't support HIP
-ROCM_DIR ?=
-HIP_ARCH ?=
-
-# Check for PETSc in ../petsc
-ifneq ($(wildcard ../petsc/lib/libpetsc.*),)
-  PETSC_DIR ?= ../petsc
-endif
-
 # Warning: SANTIZ options still don't run with /gpu/occa
 AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
 
@@ -159,6 +165,7 @@ OMP_SIMD_FLAG := $(if $(call cc_check_flag,$(OMP_SIMD_FLAG)),$(OMP_SIMD_FLAG))
 PEDANTIC      ?=
 PEDANTICFLAGS ?= -Werror -pedantic
 
+# Compiler flags
 OPT    ?= -O $(MARCHFLAG) $(OPT.$(CC_VENDOR)) $(OMP_SIMD_FLAG)
 CFLAGS ?= $(OPT) $(CFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS))
 CXXFLAGS ?= $(OPT) $(CXXFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS))
@@ -198,7 +205,6 @@ OBJDIR := build
 for_install := $(filter install,$(MAKECMDGOALS))
 LIBDIR := $(if $(for_install),$(OBJDIR),lib)
 
-
 # Installation variables
 prefix ?= /usr/local
 bindir = $(prefix)/bin
@@ -226,48 +232,69 @@ libceed.so := $(LIBDIR)/libceed.$(SO_EXT)
 libceed.a := $(LIBDIR)/libceed.a
 libceed := $(if $(STATIC),$(libceed.a),$(libceed.so))
 CEED_LIBS = -lceed
-libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/*.c gallery/*.c))
-gallery.c := $(wildcard gallery/*/ceed*.c)
-libceed.c += $(gallery.c)
 libceeds = $(libceed)
 BACKENDS_BUILTIN := /cpu/self/ref/serial /cpu/self/ref/blocked /cpu/self/opt/serial /cpu/self/opt/blocked
 BACKENDS_MAKE := $(BACKENDS_BUILTIN)
 
-# Tests
-tests.c   := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c))
-tests.f   := $(if $(FC),$(sort $(wildcard tests/t[0-9][0-9][0-9]-*.f90)))
-tests     := $(tests.c:tests/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
-ctests    := $(tests)
-tests     += $(tests.f:tests/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
-# Examples
-examples.c := $(sort $(wildcard examples/ceed/*.c))
-examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f)))
-examples   := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
-examples   += $(examples.f:examples/ceed/%.f=$(OBJDIR)/%$(EXE_SUFFIX))
-# MFEM Examples
-mfemexamples.cpp := $(sort $(wildcard examples/mfem/*.cpp))
-mfemexamples  := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%)
-# Nek5K Examples
-nekexamples   := $(OBJDIR)/nek-bps
-# PETSc Examples
-petscexamples.c := $(wildcard examples/petsc/*.c)
-petscexamples   := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%)
-# deal.II Examples
-dealiiexamples  := $(OBJDIR)/dealii-bps
-# Fluid Dynamics Examples
-fluidsexamples.c  := $(sort $(wildcard examples/fluids/*.c))
-fluidsexamples    := $(fluidsexamples.c:examples/fluids/%.c=$(OBJDIR)/fluids-%)
-# Solid Mechanics Examples
-solidsexamples.c  := $(sort $(wildcard examples/solids/*.c))
-solidsexamples    := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%)
-
-# Backends/[ref, blocked, memcheck, opt, avx, occa, magma]
+
+# ------------------------------------------------------------
+# Root directories for examples using external libraries
+# ------------------------------------------------------------
+
+# DEAL_II_DIR env variable should point to sibling directory
+ifneq ($(wildcard ../dealii/install/lib/libdeal_II.*),)
+  DEAL_II_DIR ?= ../dealii/install
+endif
+# Export for deal.II testing
+export DEAL_II_DIR
+
+# MFEM_DIR env variable should point to sibling directory
+ifneq ($(wildcard ../mfem/libmfem.*),)
+  MFEM_DIR ?= ../mfem
+endif
+
+# NEK5K_DIR env variable should point to sibling directory
+ifneq ($(wildcard ../Nek5000/*),)
+  NEK5K_DIR ?= $(abspath ../Nek5000)
+endif
+# Exports for NEK5K testing
+export CEED_DIR = $(abspath .)
+export NEK5K_DIR
+MPI ?= 1
+
+# Check for PETSc in ../petsc
+ifneq ($(wildcard ../petsc/lib/libpetsc.*),)
+  PETSC_DIR ?= ../petsc
+endif
+
+# ------------------------------------------------------------
+# Build the library (default target)
+# ------------------------------------------------------------
+
+lib: $(libceed) $(ceed.pc)
+# run 'lib' target in parallel
+par:;@$(MAKE) $(MFLAGS) V=$(V) lib
+
+$(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(libceed.so)))
+
+# ------------------------------------------------------------
+# Source files
+# ------------------------------------------------------------
+
+# Interface and gallery
+libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/*.c gallery/*.c))
+gallery.c := $(wildcard gallery/*/ceed*.c)
+libceed.c += $(gallery.c)
+
+# Backends
+# - CPU
 ref.c          := $(sort $(wildcard backends/ref/*.c))
 blocked.c      := $(sort $(wildcard backends/blocked/*.c))
 ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
 opt.c          := $(sort $(wildcard backends/opt/*.c))
 avx.c          := $(sort $(wildcard backends/avx/*.c))
 xsmm.c         := $(sort $(wildcard backends/xsmm/*.c))
+# - GPU
 cuda.c         := $(sort $(wildcard backends/cuda/*.c))
 cuda.cpp       := $(sort $(wildcard backends/cuda/*.cpp))
 cuda-ref.c     := $(sort $(wildcard backends/cuda-ref/*.c))
@@ -278,9 +305,6 @@ cuda-shared.cu := $(sort $(wildcard backends/cuda-shared/kernels/*.cu))
 cuda-gen.c     := $(sort $(wildcard backends/cuda-gen/*.c))
 cuda-gen.cpp   := $(sort $(wildcard backends/cuda-gen/*.cpp))
 cuda-gen.cu    := $(sort $(wildcard backends/cuda-gen/kernels/*.cu))
-occa.cpp       := $(sort $(shell find backends/occa -type f -name *.cpp))
-magma.c        := $(sort $(wildcard backends/magma/*.c))
-magma.cpp      := $(sort $(wildcard backends/magma/*.cpp))
 hip.c          := $(sort $(wildcard backends/hip/*.c))
 hip.cpp        := $(sort $(wildcard backends/hip/*.cpp))
 hip-ref.c      := $(sort $(wildcard backends/hip-ref/*.c))
@@ -293,28 +317,55 @@ sycl-core.cpp  := $(sort $(wildcard backends/sycl/*.sycl.cpp))
 sycl-ref.cpp   := $(sort $(wildcard backends/sycl-ref/*.sycl.cpp))
 sycl-shared.cpp:= $(sort $(wildcard backends/sycl-shared/*.sycl.cpp))
 sycl-gen.cpp   := $(sort $(wildcard backends/sycl-gen/*.sycl.cpp))
+magma.c        := $(sort $(wildcard backends/magma/*.c))
+magma.cpp      := $(sort $(wildcard backends/magma/*.cpp))
+occa.cpp       := $(sort $(shell find backends/occa -type f -name *.cpp))
 
-hip-all.c := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c)
+hip-all.c   := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c)
 hip-all.cpp := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp)
 
-# Quiet, color output
-quiet ?= $($(1))
+# Tests
+tests.c := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c))
+tests.f := $(if $(FC),$(sort $(wildcard tests/t[0-9][0-9][0-9]-*.f90)))
+tests   := $(tests.c:tests/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
+ctests  := $(tests)
+tests   += $(tests.f:tests/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
 
-# Cancel built-in and old-fashioned implicit rules which we don't use
-.SUFFIXES:
+# Examples
+examples.c := $(sort $(wildcard examples/ceed/*.c))
+examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f)))
+examples   := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
+examples   += $(examples.f:examples/ceed/%.f=$(OBJDIR)/%$(EXE_SUFFIX))
 
-.SECONDEXPANSION: # to expand $$(@D)/.DIR
+# deal.II Examples
+dealiiexamples := $(OBJDIR)/dealii-bps
 
-%/.DIR :
-	@mkdir -p $(@D)
-	@touch $@
+# MFEM Examples
+mfemexamples.cpp := $(sort $(wildcard examples/mfem/*.cpp))
+mfemexamples     := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%)
 
-.PRECIOUS: %/.DIR
+# Nek5K Examples
+nekexamples := $(OBJDIR)/nek-bps
+
+# PETSc Examples
+petscexamples.c := $(wildcard examples/petsc/*.c)
+petscexamples   := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%)
+
+# Fluid Dynamics Example
+fluidsexamples.c := $(sort $(wildcard examples/fluids/*.c))
+fluidsexamples   := $(fluidsexamples.c:examples/fluids/%.c=$(OBJDIR)/fluids-%)
+
+# Solid Mechanics Example
+solidsexamples.c := $(sort $(wildcard examples/solids/*.c))
+solidsexamples   := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%)
+
+
+# ------------------------------------------------------------
+# View configuration options
+# ------------------------------------------------------------
 
-lib: $(libceed) $(ceed.pc)
-# run 'lib' target in parallel
-par:;@$(MAKE) $(MFLAGS) V=$(V) lib
 backend_status = $(if $(filter $1,$(BACKENDS_MAKE)), [backends: $1], [not found])
+
 info:
 	$(info ------------------------------------)
 	$(info CC            = $(CC))
@@ -356,14 +407,19 @@ info:
 	$(info pkgconfigdir  = $(value pkgconfigdir))
 	$(info ------------------------------------)
 	@true
+
 info-backends:
 	$(info make: 'lib' with optional backends: $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS)))
 	@true
+
 info-backends-all:
 	$(info make: 'lib' with backends: $(BACKENDS))
 	@true
 
-$(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(libceed.so)))
+
+# ------------------------------------------------------------
+# Backends
+# ------------------------------------------------------------
 
 # Standard Backends
 libceed.c += $(ref.c)
@@ -417,22 +473,6 @@ ifneq ($(wildcard $(XSMM_DIR)/lib/libxsmm.*),)
   BACKENDS_MAKE += $(XSMM_BACKENDS)
 endif
 
-# OCCA Backends
-OCCA_BACKENDS = /cpu/self/occa
-ifneq ($(wildcard $(OCCA_DIR)/lib/libocca.*),)
-  OCCA_MODES := $(shell LD_LIBRARY_PATH=$(OCCA_DIR)/lib $(OCCA_DIR)/bin/occa modes)
-  OCCA_BACKENDS += $(if $(filter OpenMP,$(OCCA_MODES)),/cpu/openmp/occa)
-  OCCA_BACKENDS += $(if $(filter dpcpp,$(OCCA_MODES)),/gpu/dpcpp/occa)
-  OCCA_BACKENDS += $(if $(filter OpenCL,$(OCCA_MODES)),/gpu/opencl/occa)
-  OCCA_BACKENDS += $(if $(filter HIP,$(OCCA_MODES)),/gpu/hip/occa)
-  OCCA_BACKENDS += $(if $(filter CUDA,$(OCCA_MODES)),/gpu/cuda/occa)
-  $(libceeds) : CPPFLAGS += -I$(OCCA_DIR)/include
-  PKG_LIBS += -L$(abspath $(OCCA_DIR))/lib -locca
-  LIBCEED_CONTAINS_CXX = 1
-  libceed.cpp += $(occa.cpp)
-  BACKENDS_MAKE += $(OCCA_BACKENDS)
-endif
-
 # CUDA Backends
 ifneq ($(CUDA_DIR),)
   CUDA_LIB_DIR := $(wildcard $(foreach d,lib lib64 lib/x86_64-linux-gnu,$(CUDA_DIR)/$d/libcudart.${SO_EXT}))
@@ -519,9 +559,29 @@ ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
   BACKENDS_MAKE += $(MAGMA_BACKENDS)
 endif
 
+# OCCA Backends
+OCCA_BACKENDS = /cpu/self/occa
+ifneq ($(wildcard $(OCCA_DIR)/lib/libocca.*),)
+  OCCA_MODES := $(shell LD_LIBRARY_PATH=$(OCCA_DIR)/lib $(OCCA_DIR)/bin/occa modes)
+  OCCA_BACKENDS += $(if $(filter OpenMP,$(OCCA_MODES)),/cpu/openmp/occa)
+  OCCA_BACKENDS += $(if $(filter dpcpp,$(OCCA_MODES)),/gpu/dpcpp/occa)
+  OCCA_BACKENDS += $(if $(filter OpenCL,$(OCCA_MODES)),/gpu/opencl/occa)
+  OCCA_BACKENDS += $(if $(filter HIP,$(OCCA_MODES)),/gpu/hip/occa)
+  OCCA_BACKENDS += $(if $(filter CUDA,$(OCCA_MODES)),/gpu/cuda/occa)
+  $(libceeds) : CPPFLAGS += -I$(OCCA_DIR)/include
+  PKG_LIBS += -L$(abspath $(OCCA_DIR))/lib -locca
+  LIBCEED_CONTAINS_CXX = 1
+  libceed.cpp += $(occa.cpp)
+  BACKENDS_MAKE += $(OCCA_BACKENDS)
+endif
+
 BACKENDS ?= $(BACKENDS_MAKE)
 export BACKENDS
 
+
+# ------------------------------------------------------------
+# Linker Flags
+# ------------------------------------------------------------
 _pkg_ldflags = $(filter -L%,$(PKG_LIBS))
 _pkg_ldlibs = $(filter-out -L%,$(PKG_LIBS))
 $(libceeds) : CEED_LDFLAGS += $(_pkg_ldflags) $(if $(STATIC),,$(_pkg_ldflags:-L%=-Wl,-rpath,%)) $(PKG_STUBS_LIBS)
@@ -540,6 +600,11 @@ ifeq ($(LIBCEED_CONTAINS_CXX),1)
   endif
 endif
 
+
+# ------------------------------------------------------------
+# Building core library components
+# ------------------------------------------------------------
+
 # File names *-weak.c contain weak symbol definitions, which must be listed last
 # when creating shared or static libraries.
 weak_last = $(filter-out %-weak.o,$(1)) $(filter %-weak.o,$(1))
@@ -583,11 +648,26 @@ $(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.c | $$(@D)/.DIR
 $(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.f | $$(@D)/.DIR
 	$(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $(<D))/"' $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
 
+
+# ------------------------------------------------------------
+# Building examples
+# ------------------------------------------------------------
+
+# deal.II
+# Note: Invoking deal.II's CMAKE build system here
+$(OBJDIR)/dealii-bps : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) | $$(@D)/.DIR
+	mkdir -p examples/deal.II/build
+	cmake -B examples/deal.II/build -S examples/deal.II -DDEAL_II_DIR=$(DEAL_II_DIR) -DCEED_DIR=$(PWD)
+	+$(call quiet,MAKE) -C examples/deal.II/build
+	cp examples/deal.II/build/bps $(OBJDIR)/dealii-bps
+
+# MFEM
 $(OBJDIR)/mfem-% : examples/mfem/%.cpp $(libceed) | $$(@D)/.DIR
 	+$(MAKE) -C examples/mfem CEED_DIR=`pwd` \
 	  MFEM_DIR="$(abspath $(MFEM_DIR))" CXX=$(CXX) $*
 	cp examples/mfem/$* $@
 
+# Nek5000
 # Note: Multiple Nek files cannot be built in parallel. The '+' here enables
 #       this single Nek bps file to be built in parallel with other examples,
 #       such as when calling `make prove-all -j2`.
@@ -596,13 +676,7 @@ $(OBJDIR)/nek-bps : examples/nek/bps/bps.usr examples/nek/nek-examples.sh $(libc
 	mv examples/nek/build/bps $(OBJDIR)/bps
 	cp examples/nek/nek-examples.sh $(OBJDIR)/nek-bps
 
-# Note: Invoking deal.II's CMAKE build system here
-$(OBJDIR)/dealii-bps : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) | $$(@D)/.DIR
-	mkdir -p examples/deal.II/build
-	cmake -B examples/deal.II/build -S examples/deal.II -DDEAL_II_DIR=$(DEAL_II_DIR) -DCEED_DIR=$(PWD)
-	+$(call quiet,MAKE) -C examples/deal.II/build
-	cp examples/deal.II/build/bps $(OBJDIR)/dealii-bps
-
+# PETSc
 # Several executables have common utilities, but we can't build the utilities
 # from separate submake invocations because they'll compete with each
 # other/corrupt output. So we put it in this utility library, but we don't want
@@ -617,11 +691,13 @@ $(OBJDIR)/petsc-% : examples/petsc/%.c examples/petsc/libutils.a.PHONY $(libceed
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/petsc/$* $@
 
+# Fluid dynamics proxy application
 $(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/include/*.h examples/fluids/problems/*.c examples/fluids/qfunctions/*.h $(libceed) $(ceed.pc) examples/fluids/Makefile | $$(@D)/.DIR
 	+$(call quiet,MAKE) -C examples/fluids CEED_DIR=`pwd` \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/fluids/$* $@
 
+# Solid mechanics proxy application
 $(OBJDIR)/solids-% : examples/solids/%.c examples/solids/%.h \
     examples/solids/problems/*.c examples/solids/src/*.c \
     examples/solids/include/*.h examples/solids/problems/*.h examples/solids/qfunctions/*.h \
@@ -630,10 +706,31 @@ $(OBJDIR)/solids-% : examples/solids/%.c examples/solids/%.h \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/solids/$* $@
 
+examples : $(allexamples)
+ceedexamples : $(examples)
+nekexamples : $(nekexamples)
+mfemexamples : $(mfemexamples)
+petscexamples : $(petscexamples)
+
+external_examples := \
+	$(if $(MFEM_DIR),$(mfemexamples)) \
+	$(if $(PETSC_DIR),$(petscexamples)) \
+	$(if $(NEK5K_DIR),$(nekexamples)) \
+	$(if $(DEAL_II_DIR),$(dealiiexamples)) \
+	$(if $(PETSC_DIR),$(fluidsexamples)) \
+	$(if $(PETSC_DIR),$(solidsexamples))
+
+allexamples = $(examples) $(external_examples)
+
 $(examples) : $(libceed)
 $(tests) : $(libceed)
 $(tests) $(examples) : override LDFLAGS += $(if $(STATIC),,-Wl,-rpath,$(abspath $(LIBDIR))) -L$(LIBDIR)
 
+
+# ------------------------------------------------------------
+# Testing
+# ------------------------------------------------------------
+
 # Set number processes for testing
 NPROC_TEST ?= 1
 export NPROC_TEST
@@ -645,16 +742,6 @@ export NPROC_POOL
 run-% : $(OBJDIR)/%
 	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) $(<:$(OBJDIR)/%=%)
 
-external_examples := \
-	$(if $(MFEM_DIR),$(mfemexamples)) \
-	$(if $(PETSC_DIR),$(petscexamples)) \
-	$(if $(NEK5K_DIR),$(nekexamples)) \
-	$(if $(DEAL_II_DIR),$(dealiiexamples)) \
-	$(if $(PETSC_DIR),$(fluidsexamples)) \
-	$(if $(PETSC_DIR),$(solidsexamples))
-
-allexamples = $(examples) $(external_examples)
-
 # The test and prove targets can be controlled via pattern searches.  The
 # default is to run tests and those examples that have no external dependencies.
 # Examples of finer grained control:
@@ -675,6 +762,8 @@ tst : ;@$(MAKE) $(MFLAGS) V=$(V) test
 # CPU C tests only for backend %
 ctc-% : $(ctests);@$(foreach tst,$(ctests),$(tst) /cpu/$*;)
 
+# Testing with TAP format
+# https://testanything.org/tap-specification.html
 prove : $(matched)
 	$(info Testing backends: $(BACKENDS))
 	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
@@ -691,12 +780,6 @@ junit : $(matched:$(OBJDIR)/%=junit-%)
 
 all: $(alltests)
 
-examples : $(allexamples)
-ceedexamples : $(examples)
-nekexamples : $(nekexamples)
-mfemexamples : $(mfemexamples)
-petscexamples : $(petscexamples)
-
 # Benchmarks
 allbenchmarks = petsc-bps
 bench_targets = $(addprefix bench-,$(allbenchmarks))
@@ -716,6 +799,10 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 $(OBJDIR)/interface/ceed-jit-source-root-default.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
 $(OBJDIR)/interface/ceed-jit-source-root-install.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath $(includedir))/\""
 
+
+# ------------------------------------------------------------
+# Installation
+# ------------------------------------------------------------
 install : $(libceed) $(OBJDIR)/ceed.pc
 	$(INSTALL) -d $(addprefix $(if $(DESTDIR),"$(DESTDIR)"),"$(includedir)"\
 	  "$(includedir)/ceed/" "$(includedir)/ceed/jit-source/"\
@@ -740,8 +827,10 @@ install : $(libceed) $(OBJDIR)/ceed.pc
 	$(INSTALL_DATA) $(wildcard include/ceed/jit-source/magma/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/magma/"
 	$(INSTALL_DATA) $(wildcard include/ceed/jit-source/sycl/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/sycl/"
 
-.PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends info-backends-all
 
+# ------------------------------------------------------------
+# Cleaning
+# ------------------------------------------------------------
 cln clean :
 	$(RM) -r $(OBJDIR) $(LIBDIR) dist *egg* .pytest_cache *cffi*
 	$(call quiet,MAKE) -C examples clean NEK5K_DIR="$(abspath $(NEK5K_DIR))"
@@ -751,7 +840,10 @@ cln clean :
 distclean : clean
 	$(RM) -r doc/html doc/sphinx/build $(CONFIG)
 
+
+# ------------------------------------------------------------
 # Documentation
+# ------------------------------------------------------------
 DOXYGEN ?= doxygen
 
 doxygen :
@@ -762,6 +854,11 @@ doc-html doc-latexpdf doc-epub doc-livehtml : doc-% : doxygen
 
 doc : doc-html
 
+
+# ------------------------------------------------------------
+# Linting utilities
+# ------------------------------------------------------------
+
 # Style/Format
 CLANG_FORMAT      ?= clang-format
 CLANG_FORMAT_OPTS += -style=file -i
@@ -812,6 +909,11 @@ endif
 iwyu :
 	$(MAKE) -B CC=$(IWYU_CC)
 
+
+# ------------------------------------------------------------
+# Variable printing for debugging
+# ------------------------------------------------------------
+
 print :
 	@echo $(VAR)=$($(VAR))
 
@@ -824,6 +926,10 @@ print-% :
 	$(info )
 	@true
 
+
+# ------------------------------------------------------------
+# Configuration caching
+# ------------------------------------------------------------
 # "make configure" detects any variables passed on the command line or
 # previously set in config.mk, caching them in config.mk as simple
 # (:=) variables.  Variables set in config.mk or on the command line
@@ -856,6 +962,10 @@ configure :
 	@echo "Configuration cached in $(CONFIG):"
 	@cat $(CONFIG)
 
+
+# ------------------------------------------------------------
+# Building Python wheels for deployment
+# ------------------------------------------------------------
 wheel : export MARCHFLAG = -march=generic
 wheel : export WHEEL_PLAT = manylinux2010_x86_64
 wheel :
@@ -863,7 +973,12 @@ wheel :
 	  -e MARCHFLAG -e WHEEL_PLAT \
 	  quay.io/pypa/$(WHEEL_PLAT) python/make-wheels.sh
 
-.PHONY : configure wheel
+# ------------------------------------------------------------
+# Phony targets
+# ------------------------------------------------------------
+# These targets are not files but rather commands to run
+.PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends info-backends-all configure wheel
+
 
 # Include *.d deps when not -B = --always-make: useful if the paths are wonky in a container
 -include $(if $(filter B,$(MAKEFLAGS)),,$(libceed.c:%.c=$(OBJDIR)/%.d) $(tests.c:tests/%.c=$(OBJDIR)/%.d))

From a5ef5560233b41639b8f63aaa7afe24f9b67de19 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 5 Mar 2025 14:19:25 -0700
Subject: [PATCH 315/571] make - add info-basic, list backends in info

---
 Makefile | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index c0df25d13f..058361fa01 100644
--- a/Makefile
+++ b/Makefile
@@ -366,8 +366,47 @@ solidsexamples   := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%)
 
 backend_status = $(if $(filter $1,$(BACKENDS_MAKE)), [backends: $1], [not found])
 
+info-basic:
+	$(info -----------------------------------------)
+	$(info |     ___ __    ______________________  |)
+	$(info |    / (_) /_  / ____/ ____/ ____/ __ \ |)
+	$(info |   / / / __ \/ /   / __/ / __/ / / / / |)
+	$(info |  / / / /_/ / /___/ /___/ /___/ /_/ /  |)
+	$(info | /_/_/_.___/\____/_____/_____/_____/   |)
+	$(info -----------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Built-in Backends:)
+	$(info   $(BACKENDS_BUILTIN))
+	$(info )
+	$(info Additional Backends:)
+	$(info   $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS)))
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	@true
+
 info:
-	$(info ------------------------------------)
+	$(info -----------------------------------------)
+	$(info |     ___ __    ______________________  |)
+	$(info |    / (_) /_  / ____/ ____/ ____/ __ \ |)
+	$(info |   / / / __ \/ /   / __/ / __/ / / / / |)
+	$(info |  / / / /_/ / /___/ /___/ /___/ /_/ /  |)
+	$(info | /_/_/_.___/\____/_____/_____/_____/   |)
+	$(info -----------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Built-in Backends:)
+	$(info   $(BACKENDS_BUILTIN))
+	$(info )
+	$(info Additional Backends:)
+	$(info   $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS)))
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Compiler Flags:)
 	$(info CC            = $(CC))
 	$(info CXX           = $(CXX))
 	$(info FC            = $(FC))
@@ -386,26 +425,36 @@ info:
 	$(info AFLAGS        = $(AFLAGS))
 	$(info ASAN          = $(or $(ASAN),(empty)))
 	$(info VERBOSE       = $(or $(V),(empty)) [verbose=$(if $(V),on,off)])
-	$(info ------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Backend Dependencies:)
 	$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
 	$(info AVX_STATUS    = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
 	$(info XSMM_DIR      = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
-	$(info OCCA_DIR      = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
-	$(info MAGMA_DIR     = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
 	$(info CUDA_DIR      = $(CUDA_DIR)$(call backend_status,$(CUDA_BACKENDS)))
 	$(info ROCM_DIR      = $(ROCM_DIR)$(call backend_status,$(HIP_BACKENDS)))
 	$(info SYCL_DIR      = $(SYCL_DIR)$(call backend_status,$(SYCL_BACKENDS)))
-	$(info ------------------------------------)
+	$(info MAGMA_DIR     = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
+	$(info OCCA_DIR      = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Example Dependencies:)
 	$(info MFEM_DIR      = $(MFEM_DIR))
 	$(info NEK5K_DIR     = $(NEK5K_DIR))
 	$(info PETSC_DIR     = $(PETSC_DIR))
 	$(info DEAL_II_DIR   = $(DEAL_II_DIR))
-	$(info ------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Install Options:)
 	$(info prefix        = $(prefix))
 	$(info includedir    = $(value includedir))
 	$(info libdir        = $(value libdir))
 	$(info pkgconfigdir  = $(value pkgconfigdir))
-	$(info ------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
 	@true
 
 info-backends:

From 9a92872b82268e40ae396fab31704085c4650f1f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 6 Mar 2025 10:44:27 -0700
Subject: [PATCH 316/571] make - common strategy w/ [cuda,hip]-all variables

---
 Makefile | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index 058361fa01..507b248ee8 100644
--- a/Makefile
+++ b/Makefile
@@ -301,10 +301,11 @@ cuda-ref.c     := $(sort $(wildcard backends/cuda-ref/*.c))
 cuda-ref.cpp   := $(sort $(wildcard backends/cuda-ref/*.cpp))
 cuda-ref.cu    := $(sort $(wildcard backends/cuda-ref/kernels/*.cu))
 cuda-shared.c  := $(sort $(wildcard backends/cuda-shared/*.c))
-cuda-shared.cu := $(sort $(wildcard backends/cuda-shared/kernels/*.cu))
 cuda-gen.c     := $(sort $(wildcard backends/cuda-gen/*.c))
 cuda-gen.cpp   := $(sort $(wildcard backends/cuda-gen/*.cpp))
-cuda-gen.cu    := $(sort $(wildcard backends/cuda-gen/kernels/*.cu))
+cuda-all.c     := interface/ceed-cuda.c $(cuda.c) $(cuda-ref.c) $(cuda-shared.c) $(cuda-gen.c)
+cuda-all.cpp   := $(cuda.cpp) $(cuda-ref.cpp) $(cuda-gen.cpp)
+cuda-all.cu    := $(cuda-ref.cu)
 hip.c          := $(sort $(wildcard backends/hip/*.c))
 hip.cpp        := $(sort $(wildcard backends/hip/*.cpp))
 hip-ref.c      := $(sort $(wildcard backends/hip-ref/*.c))
@@ -313,6 +314,9 @@ hip-ref.hip    := $(sort $(wildcard backends/hip-ref/kernels/*.hip.cpp))
 hip-shared.c   := $(sort $(wildcard backends/hip-shared/*.c))
 hip-gen.c      := $(sort $(wildcard backends/hip-gen/*.c))
 hip-gen.cpp    := $(sort $(wildcard backends/hip-gen/*.cpp))
+hip-all.c      := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c)
+hip-all.cpp    := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp)
+hip-all.hip    := $(hip-ref.hip)
 sycl-core.cpp  := $(sort $(wildcard backends/sycl/*.sycl.cpp))
 sycl-ref.cpp   := $(sort $(wildcard backends/sycl-ref/*.sycl.cpp))
 sycl-shared.cpp:= $(sort $(wildcard backends/sycl-shared/*.sycl.cpp))
@@ -321,9 +325,6 @@ magma.c        := $(sort $(wildcard backends/magma/*.c))
 magma.cpp      := $(sort $(wildcard backends/magma/*.cpp))
 occa.cpp       := $(sort $(shell find backends/occa -type f -name *.cpp))
 
-hip-all.c   := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c)
-hip-all.cpp := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp)
-
 # Tests
 tests.c := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c))
 tests.f := $(if $(FC),$(sort $(wildcard tests/t[0-9][0-9][0-9]-*.f90)))
@@ -535,9 +536,9 @@ ifneq ($(CUDA_LIB_DIR),)
   PKG_STUBS_LIBS += -L$(CUDA_LIB_DIR_STUBS)
   LIBCEED_CONTAINS_CXX = 1
   libceed.c     += interface/ceed-cuda.c
-  libceed.c     += $(cuda.c) $(cuda-ref.c) $(cuda-shared.c) $(cuda-gen.c)
-  libceed.cpp   += $(cuda.cpp) $(cuda-ref.cpp) $(cuda-gen.cpp)
-  libceed.cu    += $(cuda-ref.cu) $(cuda-shared.cu) $(cuda-gen.cu)
+  libceed.c     += $(cuda-all.c)
+  libceed.cpp   += $(cuda-all.cpp)
+  libceed.cu    += $(cuda-all.cu)
   BACKENDS_MAKE += $(CUDA_BACKENDS)
 endif
 
@@ -555,7 +556,7 @@ ifneq ($(HIP_LIB_DIR),)
   LIBCEED_CONTAINS_CXX = 1
   libceed.c     += $(hip-all.c)
   libceed.cpp   += $(hip-all.cpp)
-  libceed.hip   += $(hip-ref.hip)
+  libceed.hip   += $(hip-all.hip)
   BACKENDS_MAKE += $(HIP_BACKENDS)
 endif
 
@@ -568,7 +569,7 @@ endif
 ifneq ($(SYCL_LIB_DIR),)
   PKG_LIBS += $(SYCL_FLAG) -lze_loader
   LIBCEED_CONTAINS_CXX = 1
-  libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp)
+  libceed.sycl  += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp)
   BACKENDS_MAKE += $(SYCL_BACKENDS)
 endif
 

From baef4b7ad4c64e341b7f00a3fb5b9fc01a969864 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 6 Mar 2025 11:33:03 -0700
Subject: [PATCH 317/571] make - minor style

---
 Makefile | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Makefile b/Makefile
index 507b248ee8..f73bc54e3b 100644
--- a/Makefile
+++ b/Makefile
@@ -632,6 +632,7 @@ export BACKENDS
 # ------------------------------------------------------------
 # Linker Flags
 # ------------------------------------------------------------
+
 _pkg_ldflags = $(filter -L%,$(PKG_LIBS))
 _pkg_ldlibs = $(filter-out -L%,$(PKG_LIBS))
 $(libceeds) : CEED_LDFLAGS += $(_pkg_ldflags) $(if $(STATIC),,$(_pkg_ldflags:-L%=-Wl,-rpath,%)) $(PKG_STUBS_LIBS)
@@ -853,6 +854,7 @@ $(OBJDIR)/interface/ceed-jit-source-root-install.o : CPPFLAGS += -DCEED_JIT_SOUR
 # ------------------------------------------------------------
 # Installation
 # ------------------------------------------------------------
+
 install : $(libceed) $(OBJDIR)/ceed.pc
 	$(INSTALL) -d $(addprefix $(if $(DESTDIR),"$(DESTDIR)"),"$(includedir)"\
 	  "$(includedir)/ceed/" "$(includedir)/ceed/jit-source/"\
@@ -881,6 +883,7 @@ install : $(libceed) $(OBJDIR)/ceed.pc
 # ------------------------------------------------------------
 # Cleaning
 # ------------------------------------------------------------
+
 cln clean :
 	$(RM) -r $(OBJDIR) $(LIBDIR) dist *egg* .pytest_cache *cffi*
 	$(call quiet,MAKE) -C examples clean NEK5K_DIR="$(abspath $(NEK5K_DIR))"
@@ -894,6 +897,7 @@ distclean : clean
 # ------------------------------------------------------------
 # Documentation
 # ------------------------------------------------------------
+
 DOXYGEN ?= doxygen
 
 doxygen :
@@ -980,6 +984,7 @@ print-% :
 # ------------------------------------------------------------
 # Configuration caching
 # ------------------------------------------------------------
+
 # "make configure" detects any variables passed on the command line or
 # previously set in config.mk, caching them in config.mk as simple
 # (:=) variables.  Variables set in config.mk or on the command line
@@ -1016,6 +1021,7 @@ configure :
 # ------------------------------------------------------------
 # Building Python wheels for deployment
 # ------------------------------------------------------------
+
 wheel : export MARCHFLAG = -march=generic
 wheel : export WHEEL_PLAT = manylinux2010_x86_64
 wheel :
@@ -1026,6 +1032,7 @@ wheel :
 # ------------------------------------------------------------
 # Phony targets
 # ------------------------------------------------------------
+
 # These targets are not files but rather commands to run
 .PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends info-backends-all configure wheel
 

From 59fa3f92dbea5a7e09ccfccd0c5c76c4fcbd3373 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 6 Mar 2025 16:29:57 -0700
Subject: [PATCH 318/571] gen - use field names for clarity

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 63 ++++++++++++++-----
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 63 ++++++++++++++-----
 2 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index f306fdaa0c..41adf6b070 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -135,6 +135,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
                                                      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
                                                      bool is_tensor, bool is_at_points, bool use_3d_slices) {
+  const char            *field_name;
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string            option_name = (is_input ? "inputs" : "outputs");
@@ -147,7 +148,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   // Field reuse info
   bool use_previous_field = field_reuse.index != -1;
 
-  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
+  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
+  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -600,9 +602,11 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   // Setup output arrays
   code << "\n    // -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    const char *field_name;
     std::string var_suffix = "_out_" + std::to_string(i);
 
-    code << "    // ---- Output field " << i << "\n";
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << "    // ---- Output field " << i << ": " << field_name << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
@@ -664,9 +668,11 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
 
-      code << "      // ---- Input field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << "      // ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -699,9 +705,11 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     }
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
@@ -731,9 +739,11 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
 
-      code << "      // ---- Input field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << "      // ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -800,9 +810,11 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     }
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
@@ -829,12 +841,18 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     code << "    {\n";
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << "\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << "      // ---- Input field " << i << ": " << field_name << "\n";
       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
     code << "      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << "\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
@@ -844,13 +862,19 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   code << "      // ---- Inputs\n";
   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "      // ------ Input field " << i << "\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+    code << "      // ------ Input field " << i << ": " << field_name << "\n";
     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
   code << "      // ---- Outputs\n";
   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "      // ------ Output field " << i << "\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << "      // ------ Output field " << i << ": " << field_name << "\n";
     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
 
@@ -868,10 +892,12 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     // Map back to coefficients
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -916,10 +942,12 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     // Copy or apply transpose grad, if needed
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -1367,9 +1395,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // -- Input restriction and basis
   code << "\n    // -- Input field restrictions and basis actions\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt f = input_field_order[i];
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
 
-    code << "    // ---- Input field " << f << "\n";
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << "    // ---- Input field " << f << ": " << field_name << "\n";
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
@@ -1388,7 +1418,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "    // ---- Output field " << i << "\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << "    // ---- Output field " << i << ": " << field_name << "\n";
 
     // ---- Basis action
     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index b3cd64eef4..4fe0e00f70 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -162,6 +162,7 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
                                                     CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, CeedInt Q_1d, bool is_input,
                                                     bool is_tensor, bool is_at_points, bool use_3d_slices) {
+  const char           *field_name;
   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string           option_name = (is_input ? "inputs" : "outputs");
@@ -174,7 +175,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
   // Field reuse info
   bool use_previous_field = field_reuse.index != -1;
 
-  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
+  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
+  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -626,9 +628,11 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   // Setup output arrays
   code << "\n    // -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    const char *field_name;
     std::string var_suffix = "_out_" + std::to_string(i);
 
-    code << "    // ---- Output field " << i << "\n";
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << "    // ---- Output field " << i << ": " << field_name << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
@@ -690,9 +694,11 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
 
-      code << "      // ---- Input field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << "      // ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -725,9 +731,11 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     }
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
@@ -757,9 +765,11 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
 
-      code << "      // ---- Input field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << "      // ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -826,9 +836,11 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     }
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
@@ -855,12 +867,18 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     code << "    {\n";
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << "\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << "      // ---- Input field " << i << ": " << field_name << "\n";
       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
     code << "      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << "\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
@@ -870,13 +888,19 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   code << "      // ---- Inputs\n";
   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "      // ------ Input field " << i << "\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+    code << "      // ------ Input field " << i << ": " << field_name << "\n";
     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
   code << "      // ---- Outputs\n";
   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "      // ------ Output field " << i << "\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << "      // ------ Output field " << i << ": " << field_name << "\n";
     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
 
@@ -894,10 +918,12 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     // Map back to coefficients
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -942,10 +968,12 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     // Copy or apply transpose grad, if needed
     code << "\n      // -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
-      code << "      // ---- Output field " << i << "\n";
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << "      // ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -1380,9 +1408,11 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // -- Input restriction and basis
   code << "\n    // -- Input field restrictions and basis actions\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt f = input_field_order[i];
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
 
-    code << "    // ---- Input field " << f << "\n";
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << "    // ---- Input field " << f << ": " << field_name << "\n";
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d,
@@ -1401,7 +1431,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "    // ---- Output field " << i << "\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << "    // ---- Output field " << i << ": " << field_name << "\n";
 
     // ---- Basis action
     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,

From 826538b3ddde0d7c15f4f01c7db2d4d257d3f79f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Mar 2025 09:45:37 -0700
Subject: [PATCH 319/571] gen - restrict input/output array pointers

---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 4 ++--
 backends/hip-gen/ceed-hip-gen-operator-build.cpp   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 41adf6b070..3f2e414fe8 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1182,11 +1182,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
+      code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
     }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
+    code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
   }
 
   code << "  const CeedInt dim = " << dim << ";\n";
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 4fe0e00f70..5acbcb3aa1 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1195,11 +1195,11 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
+      code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
     }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
+    code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
   }
 
   code << "  const CeedInt dim = " << dim << ";\n";

From 99421279ebf997f2fb157e1e18f23b9dc112bbac Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 10 Mar 2025 12:54:11 -0600
Subject: [PATCH 320/571] cuda - use BASIS_T_1D in codegen

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |  58 +++----
 backends/cuda-shared/ceed-cuda-shared-basis.c |   6 +-
 .../ceed/jit-source/cuda/cuda-gen-templates.h | 140 ++++++++--------
 .../cuda-shared-basis-nontensor-templates.h   |   8 +-
 .../cuda/cuda-shared-basis-nontensor.h        |  26 +--
 .../cuda/cuda-shared-basis-tensor-at-points.h |  48 +++---
 .../cuda/cuda-shared-basis-tensor-templates.h | 150 +++++++++---------
 .../cuda/cuda-shared-basis-tensor.h           |  56 +++----
 8 files changed, 246 insertions(+), 246 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 3f2e414fe8..fe7975c6bf 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -469,14 +469,14 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_c" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -484,27 +484,27 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_c" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradNonTensor";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
-               << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e"
+               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
@@ -537,14 +537,14 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -552,25 +552,25 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
           std::string function_name =
               (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
-               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
-               << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q"
+               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -794,8 +794,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           break;
         case CEED_EVAL_GRAD:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
-               << ", r_s" << var_suffix << ");\n";
+          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
+               << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -963,7 +963,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
+          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
                << var_suffix << ", r_q" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -1203,7 +1203,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
+  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_tensor || dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
@@ -1441,7 +1441,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   {
     bool is_compile_good = false;
 
-    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
     if (is_compile_good) {
       *is_good_build = true;
       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 993e877f32..e99387e027 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -289,7 +289,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
 
     if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
                                      CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                      "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
@@ -630,7 +630,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n";
 
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
                                    CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                    "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
@@ -704,7 +704,7 @@ int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt nu
   const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n";
 
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
+  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D",
                                    CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index 17b9658cb5..b795da6ca4 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -56,12 +56,12 @@ inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, con
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -70,10 +70,10 @@ inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -84,12 +84,12 @@ inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
@@ -98,10 +98,10 @@ inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -116,12 +116,12 @@ inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -130,11 +130,11 @@ inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -144,12 +144,12 @@ inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
@@ -158,11 +158,11 @@ inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
@@ -176,15 +176,15 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
     }
   }
 }
@@ -192,15 +192,15 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
     }
   }
 }
@@ -208,13 +208,13 @@ inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, offests provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
+template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
 inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                                CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
-    const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
+    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -223,11 +223,11 @@ inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -237,15 +237,15 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
     }
   }
 }
@@ -253,15 +253,15 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
     }
   }
 }
@@ -269,27 +269,27 @@ inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
+template <int NUM_COMP, int Q_1D, int T_1D>
 inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                         CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
       __syncthreads();
       // X derivative
       r_V[comp + 0 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
       }
       // Y derivative
       r_V[comp + 1 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
       }
       // Z derivative
       r_V[comp + 2 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
       }
       __syncthreads();
     }
@@ -299,28 +299,28 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
 //------------------------------------------------------------------------------
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
+template <int NUM_COMP, int Q_1D, int T_1D>
 inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                  CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
       // X derivative
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
       }
       __syncthreads();
       // Y derivative
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
       }
       __syncthreads();
       // Z derivative
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
       }
     }
   }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
index c142b05c3d..4cb265e78a 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -43,7 +43,7 @@ inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScal
 //------------------------------------------------------------------------------
 // Interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
+template <int NUM_COMP, int P, int Q, int T_1D>
 inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -54,7 +54,7 @@ inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *
 //------------------------------------------------------------------------------
 // Interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
+template <int NUM_COMP, int P, int Q, int T_1D>
 inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                 CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -66,7 +66,7 @@ inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const Cee
 //------------------------------------------------------------------------------
 // Derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
 inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -78,7 +78,7 @@ inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__
 //------------------------------------------------------------------------------
 // Derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
 inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index ad10f7dc9b..3a7fb2e241 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -23,7 +23,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -36,7 +36,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -50,7 +50,7 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -63,7 +63,7 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -77,7 +77,7 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -90,7 +90,7 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -106,7 +106,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, c
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
@@ -119,7 +119,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, c
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -133,7 +133,7 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -146,7 +146,7 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -160,7 +160,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -173,7 +173,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -189,7 +189,7 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_W[1];
 
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index d24106863f..1b318f80e0 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -29,7 +29,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -46,14 +46,14 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -85,7 +85,7 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP];
@@ -133,13 +133,13 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -156,7 +156,7 @@ extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, co
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP];
@@ -193,13 +193,13 @@ extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, co
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -218,7 +218,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -235,14 +235,14 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -274,7 +274,7 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
@@ -323,13 +323,13 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -346,7 +346,7 @@ extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, cons
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
@@ -384,13 +384,13 @@ extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, cons
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index ba2a273a40..d49cfcd717 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -48,7 +48,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 // 1D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
@@ -58,7 +58,7 @@ inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restr
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                          CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -69,7 +69,7 @@ inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar
 //------------------------------------------------------------------------------
 // 1D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -80,7 +80,7 @@ inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restric
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -103,7 +103,7 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr
 //------------------------------------------------------------------------------
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -119,7 +119,7 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 // 2D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -135,7 +135,7 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -151,7 +151,7 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -167,7 +167,7 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -182,56 +182,56 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -250,7 +250,7 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_
 //------------------------------------------------------------------------------
 // 3D tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -273,7 +273,7 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 // 3D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -296,7 +296,7 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 // 3D tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < Q_1D; k++) {
     V[k] = 0.0;
@@ -311,7 +311,7 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < P_1D; k++) {
     V[k] = 0.0;
@@ -326,7 +326,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -349,7 +349,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -371,7 +371,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -394,7 +394,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -416,108 +416,108 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 3D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                               CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
-    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
-    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
-    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index a70481fb55..312442c3aa 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -23,7 +23,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
@@ -37,16 +37,16 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -62,7 +62,7 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -76,16 +76,16 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -101,7 +101,7 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -115,16 +115,16 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -143,7 +143,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
@@ -159,18 +159,18 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                     d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -186,7 +186,7 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -202,18 +202,18 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -229,7 +229,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -245,18 +245,18 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -274,7 +274,7 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
 

From 6b92dc4b38fb4a2ad1e306f024e5961058ad12b3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 10 Mar 2025 13:19:25 -0600
Subject: [PATCH 321/571] hip - use BASIS_T_1D in codegen

---
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |  58 +++----
 backends/hip-shared/ceed-hip-shared-basis.c   |   6 +-
 .../ceed/jit-source/hip/hip-gen-templates.h   | 140 ++++++++--------
 .../hip-shared-basis-nontensor-templates.h    |   8 +-
 .../hip/hip-shared-basis-nontensor.h          |  26 +--
 .../hip/hip-shared-basis-tensor-at-points.h   |  48 +++---
 .../hip/hip-shared-basis-tensor-templates.h   | 150 +++++++++---------
 .../jit-source/hip/hip-shared-basis-tensor.h  |  56 +++----
 8 files changed, 246 insertions(+), 246 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 5acbcb3aa1..9bb1a7e77f 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -496,14 +496,14 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_c" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -511,27 +511,27 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_c" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
-               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradNonTensor";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
-               << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e"
+               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
@@ -564,14 +564,14 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -579,25 +579,25 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
-               << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
           std::string function_name =
               (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
-               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
-               << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q"
+               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -820,8 +820,8 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           break;
         case CEED_EVAL_GRAD:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
-               << ", r_s" << var_suffix << ");\n";
+          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
+               << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -989,7 +989,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
+          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
                << var_suffix << ", r_q" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -1216,7 +1216,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
+  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_tensor || dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
@@ -1459,7 +1459,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   {
     bool is_compile_good = false;
 
-    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
+    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
     if (is_compile_good) {
       *is_good_build = true;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index f0080d1eda..37859b1f21 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -354,7 +354,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
 
     if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
                                     CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                     "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE",
                                     data->block_sizes[0]));
@@ -692,7 +692,7 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   // Compile basis kernels
   const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor.h>\n";
 
-  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
                                   CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                   "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
                                   data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD",
@@ -768,7 +768,7 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
 
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
   CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
-  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
+  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D",
                                   CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
                                   data->block_sizes[0]));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 358583a79e..53e02133bb 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -56,12 +56,12 @@ inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, cons
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -70,9 +70,9 @@ inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt nu
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -83,12 +83,12 @@ inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt ele
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
@@ -97,10 +97,10 @@ inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt n
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -115,12 +115,12 @@ inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt el
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -129,10 +129,10 @@ inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt nu
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -142,12 +142,12 @@ inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt ele
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
@@ -156,11 +156,11 @@ inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt n
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
@@ -174,15 +174,15 @@ inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt el
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
     }
   }
 }
@@ -190,14 +190,14 @@ inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt nu
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
     }
   }
 }
@@ -205,13 +205,13 @@ inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt ele
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, offests provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
+template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
 inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                                CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
-    const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
+    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -220,11 +220,11 @@ inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedI
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -234,15 +234,15 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedIn
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
 inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
     }
   }
 }
@@ -250,15 +250,15 @@ inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt n
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
 inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
     }
   }
 }
@@ -266,27 +266,27 @@ inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt el
 //------------------------------------------------------------------------------
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
+template <int NUM_COMP, int Q_1D, int T_1D>
 inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                         CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
       __syncthreads();
       // X derivative
       r_V[comp + 0 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
       }
       // Y derivative
       r_V[comp + 1 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
       }
       // Z derivative
       r_V[comp + 2 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
       }
       __syncthreads();
     }
@@ -296,28 +296,28 @@ inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, c
 //------------------------------------------------------------------------------
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
+template <int NUM_COMP, int Q_1D, int T_1D>
 inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                  CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // X derivative
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
       }
       __syncthreads();
       // Y derivative
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
       }
       __syncthreads();
       // Z derivative
-      for (CeedInt i = 0; i < Q_1d; i++) {
-        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
       }
     }
   }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
index d394179dfe..898b0ff331 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -43,7 +43,7 @@ inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScala
 //------------------------------------------------------------------------------
 // Interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
+template <int NUM_COMP, int P, int Q, int T_1D>
 inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -54,7 +54,7 @@ inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *_
 //------------------------------------------------------------------------------
 // Interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
+template <int NUM_COMP, int P, int Q, int T_1D>
 inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                 CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -66,7 +66,7 @@ inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const Ceed
 //------------------------------------------------------------------------------
 // Derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
 inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -78,7 +78,7 @@ inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__r
 //------------------------------------------------------------------------------
 // Derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
 inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
index c892a9c939..dabe392f10 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -37,7 +37,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -51,7 +51,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -64,7 +64,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -78,7 +78,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -91,7 +91,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -108,7 +108,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
@@ -121,7 +121,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -135,7 +135,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -148,7 +148,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
     WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -162,7 +162,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
@@ -175,7 +175,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
     SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -192,7 +192,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
 
   CeedScalar r_W[1];
 
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 5de645c501..81cca74474 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -30,7 +30,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -47,14 +47,14 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -86,7 +86,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP];
@@ -134,13 +134,13 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -157,7 +157,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP];
@@ -194,13 +194,13 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -220,7 +220,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -237,14 +237,14 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -276,7 +276,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
@@ -325,13 +325,13 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -348,7 +348,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_X[BASIS_DIM];
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
@@ -386,13 +386,13 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index 4f4cc58e78..be7857ded1 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -48,7 +48,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 // 1D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
@@ -58,7 +58,7 @@ inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restri
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                          CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -69,7 +69,7 @@ inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar
 //------------------------------------------------------------------------------
 // 1D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -80,7 +80,7 @@ inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -103,7 +103,7 @@ inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restri
 //------------------------------------------------------------------------------
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -119,7 +119,7 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 2D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -135,7 +135,7 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -151,7 +151,7 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -167,7 +167,7 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
@@ -182,55 +182,55 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedS
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
@@ -249,7 +249,7 @@ inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__
 //------------------------------------------------------------------------------
 // 3D tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -272,7 +272,7 @@ inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 3D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -295,7 +295,7 @@ inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 3D tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < Q_1D; k++) {
     V[k] = 0.0;
@@ -310,7 +310,7 @@ inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < P_1D; k++) {
     V[k] = 0.0;
@@ -325,7 +325,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -348,7 +348,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -370,7 +370,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedS
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -393,7 +393,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -415,107 +415,107 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedS
 //------------------------------------------------------------------------------
 // 3D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                               CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
-    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
-    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
-    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 06c6d370b9..13e3690b38 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
@@ -38,16 +38,16 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -63,7 +63,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -77,16 +77,16 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -102,7 +102,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -116,16 +116,16 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -144,7 +144,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
@@ -160,18 +160,18 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                     d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -188,7 +188,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -204,18 +204,18 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -232,7 +232,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
@@ -248,18 +248,18 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -278,7 +278,7 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
 

From 51f0702717a794f7620f2392d972a3b77acbbcb5 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 13 Mar 2025 09:40:45 -0600
Subject: [PATCH 322/571] Add Zachary R. Atkins to authors

---
 AUTHORS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/AUTHORS b/AUTHORS
index 8c6e400008..c41f42dfb9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,4 +1,5 @@
 Ahmad Abdelfattah
+Zachary R. Atkins
 Valeria Barra
 Natalie Beams
 Jed Brown

From 28c1f7472c6a66b820596d9869102b453e5a7e1f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 13 Mar 2025 16:34:19 -0600
Subject: [PATCH 323/571] gpu - log error to debug on JiT try & fail

---
 backends/cuda/ceed-cuda-compile.cpp | 13 +++++++++++--
 backends/hip/ceed-hip-compile.cpp   | 13 +++++++++++--
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index d11cb3b499..c0c2be9035 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -132,14 +132,23 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   }
   CeedCallBackend(CeedFree(&opts));
   *is_compile_good = result == NVRTC_SUCCESS;
-  if (!*is_compile_good && throw_error) {
+  if (!*is_compile_good) {
     char  *log;
     size_t log_size;
 
     CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
     CeedCallBackend(CeedMalloc(log_size, &log));
     CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
-    return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
+    if (throw_error) {
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
+    } else {
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      CeedCallBackend(CeedFree(&log));
+      CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
+      return CEED_ERROR_SUCCESS;
+    }
   }
 
 #if CUDA_VERSION >= 11010
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 52dd6848c3..bad8519db6 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -134,14 +134,23 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e
   }
   CeedCallBackend(CeedFree(&opts));
   *is_compile_good = result == HIPRTC_SUCCESS;
-  if (!*is_compile_good && throw_error) {
+  if (!*is_compile_good) {
     size_t log_size;
     char  *log;
 
     CeedChk_hiprtc(ceed, hiprtcGetProgramLogSize(prog, &log_size));
     CeedCallBackend(CeedMalloc(log_size, &log));
     CeedCallHiprtc(ceed, hiprtcGetProgramLog(prog, log));
-    return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log);
+    if (throw_error) {
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log);
+    } else {
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", hiprtcGetErrorString(result), log);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      CeedCallBackend(CeedFree(&log));
+      CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog));
+      return CEED_ERROR_SUCCESS;
+    }
   }
 
   CeedCallHiprtc(ceed, hiprtcGetCodeSize(prog, &ptx_size));

From 90c303742d164f80a8e78a9994ad68ff3f8b20b2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 18 Mar 2025 17:36:43 -0600
Subject: [PATCH 324/571] gen - use blocksize of 1 elem AtPoints

---
 backends/cuda-gen/ceed-cuda-gen-operator.c       | 2 +-
 backends/hip-gen/ceed-hip-gen-operator-build.cpp | 1 +
 backends/hip-gen/ceed-hip-gen-operator.c         | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 05e9498d02..0e42eeaaed 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -209,7 +209,7 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
 
   if (is_tensor) {
-    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block,
                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
   } else {
     CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 9bb1a7e77f..de77fc3121 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1456,6 +1456,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes));
+  if (is_at_points) block_sizes[2] = 1;
   {
     bool is_compile_good = false;
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 0ead03b8f5..60f28accd9 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -142,6 +142,7 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
 
   if (is_tensor) {
     CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+    if (is_at_points) block_sizes[2] = 1;
   } else {
     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
 

From f725b54ba745f2e00cbca93711762022af4b66cb Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 26 Feb 2025 15:48:21 -0700
Subject: [PATCH 325/571] gpu - add P_1D to template args for AtPoints

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 17 ++++-----
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 17 ++++-----
 ...-shared-basis-tensor-at-points-templates.h | 24 ++++++-------
 .../cuda/cuda-shared-basis-tensor-at-points.h | 36 +++++++++----------
 ...-shared-basis-tensor-at-points-templates.h | 24 ++++++-------
 .../hip/hip-shared-basis-tensor-at-points.h   | 36 +++++++++----------
 6 files changed, 78 insertions(+), 76 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index fe7975c6bf..93d290c7e2 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -670,6 +670,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
       code << "      // ---- Input field " << i << ": " << field_name << "\n";
@@ -684,13 +685,13 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
-               << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
+               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
-               << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
+               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -919,15 +920,15 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      if (i >= points.num_per_elem[elem]) {\n";
           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
-               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
           code << "      if (i >= points.num_per_elem[elem]) {\n";
           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
-               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index de77fc3121..86293bdfbd 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -696,6 +696,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
       code << "      // ---- Input field " << i << ": " << field_name << "\n";
@@ -710,13 +711,13 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
-               << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
+               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
-               << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
+               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -945,15 +946,15 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      if (i >= points.num_per_elem[elem]) {\n";
           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
-               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
           code << "      if (i >= points.num_per_elem[elem]) {\n";
           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
-               << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
+          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 32437bf4c4..eda8d4076b 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // 1D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -61,7 +61,7 @@ inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -86,7 +86,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 1D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -107,7 +107,7 @@ inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -136,7 +136,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -166,7 +166,7 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -204,7 +204,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
@@ -238,7 +238,7 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -284,7 +284,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 3D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -319,7 +319,7 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -362,7 +362,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 3D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
@@ -402,7 +402,7 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index 1b318f80e0..c61a7be300 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -64,11 +64,11 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
 
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       if (BASIS_DIM == 1) {
-        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 2) {
-        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 3) {
-        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       }
       WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
     }
@@ -122,11 +122,11 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
       if (BASIS_DIM == 1) {
-        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();
@@ -182,11 +182,11 @@ extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, co
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
       if (BASIS_DIM == 1) {
-        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();
@@ -253,11 +253,11 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
 
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       if (BASIS_DIM == 1) {
-        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 2) {
-        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 3) {
-        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       }
       WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
     }
@@ -312,11 +312,11 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
       ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                            r_U);
       if (BASIS_DIM == 1) {
-        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();
@@ -373,11 +373,11 @@ extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, cons
       ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                            r_U);
       if (BASIS_DIM == 1) {
-        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 923de63395..a6abc6a807 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // 1D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                         CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -61,7 +61,7 @@ inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, c
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -86,7 +86,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee
 //------------------------------------------------------------------------------
 // 1D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -107,7 +107,7 @@ inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, con
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -136,7 +136,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -166,7 +166,7 @@ inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, c
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -204,7 +204,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
@@ -238,7 +238,7 @@ inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, con
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -284,7 +284,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
 //------------------------------------------------------------------------------
 // 3D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -319,7 +319,7 @@ inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, c
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -362,7 +362,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
 //------------------------------------------------------------------------------
 // 3D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
@@ -402,7 +402,7 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 81cca74474..e48c1681da 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -65,11 +65,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       if (BASIS_DIM == 1) {
-        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 2) {
-        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 3) {
-        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       }
       WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
     }
@@ -123,11 +123,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
       if (BASIS_DIM == 1) {
-        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();
@@ -183,11 +183,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
       if (BASIS_DIM == 1) {
-        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();
@@ -255,11 +255,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 
       ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
       if (BASIS_DIM == 1) {
-        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 2) {
-        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       } else if (BASIS_DIM == 3) {
-        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
       }
       WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
     }
@@ -314,11 +314,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
       ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                            r_U);
       if (BASIS_DIM == 1) {
-        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();
@@ -375,11 +375,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
       ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                            r_U);
       if (BASIS_DIM == 1) {
-        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 2) {
-        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       } else if (BASIS_DIM == 3) {
-        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
     __syncthreads();

From 343e3094792a64f9c2da70ef2256f98e7dc173cf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 26 Feb 2025 16:40:46 -0700
Subject: [PATCH 326/571] gpu - isolate core 2D tensor logic to allow flat
 version

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |   4 +-
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |   4 +-
 .../cuda-shared-basis-nontensor-templates.h   |   2 +-
 .../cuda/cuda-shared-basis-nontensor.h        |   2 +-
 ...-shared-basis-tensor-at-points-templates.h |  68 ++++++----
 .../cuda/cuda-shared-basis-tensor-templates.h | 121 +++++++++++-------
 .../cuda/cuda-shared-basis-tensor.h           |   6 +-
 .../hip-shared-basis-nontensor-templates.h    |   2 +-
 .../hip/hip-shared-basis-nontensor.h          |   2 +-
 ...-shared-basis-tensor-at-points-templates.h |  68 +++++++---
 .../hip/hip-shared-basis-tensor-templates.h   | 108 ++++++++++------
 11 files changed, 251 insertions(+), 136 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 93d290c7e2..58a7859b24 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -167,8 +167,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Set field constants
+  code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
-    code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
 
@@ -517,7 +517,7 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
-          code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
         }
         break;
       }
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 86293bdfbd..d1beb5fac7 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -194,8 +194,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Set field constants
+  code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
-    code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
 
@@ -544,7 +544,7 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
-          code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
         }
         break;
       }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
index 4cb265e78a..da67dcf20c 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -92,7 +92,7 @@ inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedS
 //------------------------------------------------------------------------------
 // Quadrature weights
 //------------------------------------------------------------------------------
-template <int Q>
+template <int P, int Q>
 inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
   *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index 3a7fb2e241..d3c3ef3d86 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -194,7 +194,7 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   CeedScalar r_W[1];
 
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
+    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
     WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index eda8d4076b..9444df06bf 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -136,16 +136,16 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                             const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     // Contract x direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -163,18 +163,24 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p,
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  InterpAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                                 CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                                      const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
     __syncthreads();
     // Contract y direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
@@ -186,10 +192,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        const CeedInt ii = (i + t_id_x) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          const CeedInt jj = (j + t_id_y) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -197,23 +203,29 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
     }
     // Pull from shared to register
     __syncthreads();
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
+  InterpTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                      CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                           const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract x direction
@@ -235,18 +247,24 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  GradAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                               CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                                    const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract y direction
@@ -261,10 +279,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+          const CeedInt ii = (i + t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+            const CeedInt jj = (j + t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -273,10 +291,16 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
     }
     // Pull from shared to register
     __syncthreads();
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
+  GradTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
+}
+
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index d49cfcd717..6141838798 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -91,7 +91,7 @@ inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *
 //------------------------------------------------------------------------------
 // 1D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
 }
@@ -104,13 +104,14 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                   CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -120,13 +121,14 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c
 // 2D tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                   CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+  if (t_id_x < Q_1D && t_id_y < Q_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -136,13 +138,14 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -152,13 +155,14 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedSca
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+  if (t_id_x < P_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -168,12 +172,13 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                               CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
-  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+  if (t_id_x < P_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -183,64 +188,94 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                      CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                      CeedScalar *__restrict__ r_V) {
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                               CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                    CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                    CeedScalar *__restrict__ r_V) {
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                             CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
 template <int Q_1D>
+inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
+                                           CeedScalar *w) {
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
+}
+
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
+  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
 }
 
 //------------------------------------------------------------------------------
@@ -524,7 +559,7 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co
 //------------------------------------------------------------------------------
 // 3D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 312442c3aa..6f1ce0944c 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -281,13 +281,13 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
-      Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 2) {
-      WeightTensor2d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 3) {
-      WeightTensor3d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
                                            d_W);
     }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
index 898b0ff331..b8541ad59a 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -92,7 +92,7 @@ inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedSc
 //------------------------------------------------------------------------------
 // Quadrature weights
 //------------------------------------------------------------------------------
-template <int Q>
+template <int P, int Q>
 inline __device__ void WeightNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
   *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
index dabe392f10..426ae28222 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -197,7 +197,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_W[1];
 
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
+    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
     WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index a6abc6a807..48ebb0ccbe 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -136,16 +136,16 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                             const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     // Contract x direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -163,18 +163,25 @@ inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, c
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  InterpAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
-                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                                      const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X,
+                                                      CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
     __syncthreads();
     // Contract y direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
@@ -186,10 +193,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        const CeedInt ii = (i + t_id_x) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          const CeedInt jj = (j + t_id_y) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -201,19 +208,25 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  InterpTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
-                                      CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                           const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract x direction
@@ -235,18 +248,25 @@ inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, con
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  GradAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
-                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
+                                                    const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X,
+                                                    CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract y direction
@@ -261,10 +281,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+          const CeedInt ii = (i + t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+            const CeedInt jj = (j + t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -273,10 +293,16 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
     }
     // Pull from shared to register
     __syncthreads();
-    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
   }
 }
 
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  GradTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
+}
+
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index be7857ded1..7871193a31 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -104,13 +104,14 @@ inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restri
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                   CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -120,13 +121,15 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, co
 // 2D tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                   CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+>>>>>>> b855402d (gpu - isolate core 2D tensor logic to allow flat version)
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+  if (t_id_x < Q_1D && t_id_y < Q_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -136,13 +139,14 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, co
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -152,13 +156,14 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScal
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+  if (t_id_x < P_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -168,12 +173,13 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScal
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
-  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                               CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
-  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+  if (t_id_x < P_1D && t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -183,57 +189,81 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedS
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                               CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                    CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                    CeedScalar *__restrict__ r_V) {
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                             CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
+                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------

From ca595be6df907a4366bcc1f56f7a62068f97f05f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 27 Feb 2025 14:08:05 -0700
Subject: [PATCH 327/571] gpu - add 2d Flattened variants of functions

---
 .../cuda/cuda-shared-basis-tensor-templates.h | 39 +++++++++++++++
 .../hip/hip-shared-basis-tensor-templates.h   | 47 ++++++++++++++++++-
 .../jit-source/hip/hip-shared-basis-tensor.h  |  6 +--
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index 6141838798..b76559bfbf 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -203,6 +203,14 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
@@ -222,6 +230,14 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -243,6 +259,14 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
@@ -264,6 +288,14 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
   GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
@@ -278,6 +310,13 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_
   WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
 }
 
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
+}
+
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index 7871193a31..b3490e3fe5 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -203,6 +203,14 @@ inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__
   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
@@ -222,6 +230,14 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedS
   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -243,6 +259,14 @@ inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__re
   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
@@ -264,12 +288,33 @@ inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedSca
   GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
 }
 
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
 template <int Q_1D>
+inline __device__ void WeightTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
+                                           CeedScalar *w) {
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
+}
+
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
+  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
+}
+
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
+
+  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
 }
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 13e3690b38..7f7ffaa2ad 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -284,13 +284,13 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
 
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
-      Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 2) {
-      WeightTensor2d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 3) {
-      WeightTensor3d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
                                            d_W);
     }

From 412e56839ae1393ae29619b6fd39ea10cdd89adf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 28 Feb 2025 09:37:59 -0700
Subject: [PATCH 328/571] gpu - use 2d Flat variants in gen

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 353 +++++++++++-------
 .../cuda/cuda-shared-basis-tensor-templates.h |  16 +-
 2 files changed, 225 insertions(+), 144 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 58a7859b24..ae27028e08 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -33,35 +33,71 @@ struct FieldReuse_Cuda {
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
-                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
-                                                bool *use_3d_slices) {
-  // Find dim, P_1d, Q_1d
-  *max_P_1d  = 0;
-  *Q_1d      = 0;
-  *dim       = 0;
-  *is_tensor = true;
+                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
+                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
+  // Check if all are tensor
+  *is_all_tensor = true;
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
+
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
+
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedBasis basis;
+
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
+
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
+  }
 
+  // Find max_P, max_P_1d, Q, and Q_1d
+  bool is_all_3d = true;
+
+  *max_P    = 0;
+  *max_P_1d = 0;
+  *Q        = 0;
+  *Q_1d     = 0;
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedBasis basis;
 
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       bool    is_field_tensor;
-      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      *is_tensor = *is_tensor && is_field_tensor;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
-      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
-      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      // Check if 3D
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
-      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *dim = field_dim;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
-      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
-      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *Q_1d = field_Q_1d;
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
+
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
     CeedCallBackend(CeedBasisDestroy(&basis));
   }
@@ -71,28 +107,36 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       bool    is_field_tensor;
-      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      *is_tensor = *is_tensor && is_field_tensor;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
-      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
-      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      // Check if 3D
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
-      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *dim = field_dim;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
-      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
-      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *Q_1d = field_Q_1d;
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
+
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
     CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
   *use_3d_slices = false;
-  if (*dim == 3) {
+  if (is_all_3d && *is_all_tensor) {
     bool was_grad_found = false;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -134,16 +178,20 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
                                                      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
-                                                     bool is_tensor, bool is_at_points, bool use_3d_slices) {
+                                                     bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+  bool      is_tensor = true;
+  CeedBasis basis;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+
   const char            *field_name;
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string            option_name = (is_input ? "inputs" : "outputs");
   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
-  CeedInt                elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedInt                elem_size = 0, num_comp = 0, dim = 1, P_1d = 0;
   CeedElemRestriction    elem_rstr;
   CeedBasis_Cuda_shared *basis_data;
-  CeedBasis              basis;
 
   // Field reuse info
   bool use_previous_field = field_reuse.index != -1;
@@ -158,15 +206,22 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
-  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Set field constants
+  code << "  const CeedInt dim" << var_suffix << " = " << dim << ";\n";
+  if (is_tensor && !is_all_tensor) {
+    CeedInt P = 0;
+
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+    code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P) << ";\n";
+  }
   code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
@@ -272,9 +327,10 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 
             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
-            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
-            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
-                 << var_suffix << ");\n";
+            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
+                 << (is_tensor ? "" : var_suffix) << "];\n";
+            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
+                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         }
       }
@@ -294,18 +350,17 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 //------------------------------------------------------------------------------
 // Restriction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
+static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                       CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
+                                                       CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string               P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
+  std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
-  CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedInt                   elem_size = 0, num_comp = 0;
   CeedSize                  l_size;
   CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
   CeedElemRestriction_Cuda *rstr_data;
   CeedElemRestriction       elem_rstr;
-  CeedBasis                 basis;
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -315,12 +370,6 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
   }
-  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
-  if (basis != CEED_BASIS_NONE) {
-    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
-  }
-  CeedCallBackend(CeedBasisDestroy(&basis));
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Restriction
@@ -348,7 +397,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           code << "    // CompStride: " << comp_stride << "\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-          code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+          code << "    ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
                << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
@@ -364,7 +413,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
           }
           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-          code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+          code << "    ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
                << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
@@ -394,7 +443,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+        code << "    WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
              << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
@@ -410,7 +459,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+        code << "    WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
              << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
@@ -431,15 +480,19 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 // Basis
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
-                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
-                                                 bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
+                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
+                                                 bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+  bool      is_tensor = true;
+  CeedBasis basis;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
-  CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
   CeedElemRestriction elem_rstr;
-  CeedBasis           basis;
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -448,8 +501,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
-  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
@@ -472,11 +525,13 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
-          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
+          std::string function_name = is_tensor
+                                          ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                          : "InterpNonTensor";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
-               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -494,17 +549,19 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
+          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
+                                      (is_all_tensor ? "" : "Flattened");
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
-               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" << (dim >= 3 ? Q_name : "1")
+               << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
+               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e"
-               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
+               << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
@@ -512,7 +569,9 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           code << "    // Nothing to do AtPoints\n";
         } else {
           CeedBasis_Cuda_shared *basis_data;
-          std::string            function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
+          std::string            function_name = is_tensor
+                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                     : "WeightNonTensor";
 
           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
@@ -541,10 +600,11 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
-              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                        : "InterpTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
-               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
+               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -561,16 +621,16 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name =
-              (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
+          std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
+                                      std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
-               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
+               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q"
-               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
+               << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -589,13 +649,13 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
 //------------------------------------------------------------------------------
 // QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt max_num_points,
+static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt max_dim, CeedInt max_num_points,
                                                      CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
-                                                     std::string qfunction_name, CeedInt Q_1d, bool is_tensor, bool is_at_points,
+                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
-  std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
+  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
 
@@ -613,25 +673,28 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
         if (is_at_points) {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "*dim" << var_suffix
+               << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else if (use_3d_slices) {
@@ -641,7 +704,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_WEIGHT:
@@ -663,8 +727,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     code << "      const CeedInt p = i % max_num_points;\n\n";
 
     code << "      // -- Coordinates\n";
-    code << "      CeedScalar r_x[dim];\n";
-    code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+    code << "      CeedScalar r_x[max_dim];\n";
+    code << "      ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
 
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -685,13 +749,13 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
-               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
-               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << "      GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -721,7 +785,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -794,7 +858,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
                << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
@@ -826,7 +890,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -882,7 +946,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   // Apply QFunction
   code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "1";
   } else {
     code << Q_name;
@@ -920,14 +984,14 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << "      if (i >= points.num_per_elem[elem]) {\n";
           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << "      InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
           code << "      if (i >= points.num_per_elem[elem]) {\n";
-          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << "      GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -985,9 +1049,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 // Build single operator kernel
 //------------------------------------------------------------------------------
 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
-  bool                    is_tensor = true, is_at_points = false, use_3d_slices = false;
+  bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                    ceed;
-  CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
   CeedQFunction_Cuda_gen *qf_data;
   CeedQFunction           qf;
@@ -1009,7 +1073,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // Check field compatibility
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   {
-    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
+    bool has_shared_bases = true;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedBasis basis;
@@ -1058,7 +1122,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
       CeedCallBackend(CeedBasisDestroy(&basis));
     }
     // -- Fallback to ref if not all bases are shared
-    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
+    if (!has_shared_bases) {
       *is_good_build = false;
       return CEED_ERROR_SUCCESS;
     }
@@ -1070,10 +1134,16 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 
   // Get operator data
   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
-  CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
-                                                       qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
-  if (dim == 0) dim = 1;
-  data->dim = dim;
+  {
+    CeedInt max_P, max_P_1d;
+
+    CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
+                                                         op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor,
+                                                         &use_3d_slices));
+    data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
+  }
+  if (max_dim == 0) max_dim = 1;
+  data->dim = max_dim;
   if (is_at_points) {
     CeedElemRestriction_Cuda *rstr_data;
     CeedElemRestriction       rstr_points = NULL;
@@ -1121,10 +1191,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   }
 
   // Load basis source files
-  if (is_tensor) {
+  if (!is_all_nontensor) {
     code << "// Tensor basis source\n";
     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
-  } else {
+  }
+  if (!is_all_tensor) {
     code << "// Non-tensor basis source\n";
     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
   }
@@ -1143,7 +1214,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 
   // Define CEED_Q_VLA
   code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "#define CEED_Q_VLA 1\n\n";
   } else {
     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
@@ -1190,8 +1261,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
   }
 
-  code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
+  code << "  const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << "  const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
   if (is_at_points) {
     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
@@ -1204,7 +1280,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_tensor || dim == 1) ? "" : "*OP_T_1D") << ";\n";
+  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
@@ -1213,12 +1289,14 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     input_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
     CeedEvalMode eval_mode_i;
     CeedBasis    basis_i;
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
@@ -1252,11 +1330,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     output_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
     CeedEvalMode eval_mode_i;
     CeedBasis    basis_i;
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
@@ -1310,12 +1390,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
-                                                              true, is_tensor, is_at_points, use_3d_slices));
+                                                              true, is_all_tensor, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
-                                                              false, is_tensor, is_at_points, use_3d_slices));
+                                                              false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -1333,7 +1413,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -1343,7 +1423,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
@@ -1403,18 +1483,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     code << "    // ---- Input field " << f << ": " << field_name << "\n";
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
-                                                                Q_1d, true, is_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, max_dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, max_dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_all_tensor,
                                                           is_at_points, use_3d_slices));
   }
 
   // -- Q function
-  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
-                                                            num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
-                                                            is_at_points, use_3d_slices));
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, max_dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
+                                                            num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d,
+                                                            is_all_tensor, is_at_points, use_3d_slices));
 
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
@@ -1425,12 +1505,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     code << "    // ---- Output field " << i << ": " << field_name << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
-                                                          is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, max_dim, op_output_fields[i], qf_output_fields[i], Q_1d, false,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
-                                                                is_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, max_dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
+                                                                is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Close loop and function
@@ -1440,9 +1520,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 
   // Compile
   {
-    bool is_compile_good = false;
+    bool          is_compile_good = false;
+    const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
 
-    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
     if (is_compile_good) {
       *is_good_build = true;
       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index b76559bfbf..628967402a 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -203,12 +203,12 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
 
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -230,12 +230,12 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                         CeedScalar *__restrict__ r_V) {
   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
 
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -259,12 +259,12 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
 
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -288,12 +288,12 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
   GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
 
-  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
 }
 
 //------------------------------------------------------------------------------

From c433aabc65bbaa1ba98688d834b18c213e9c6270 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Mar 2025 16:18:48 -0600
Subject: [PATCH 329/571] cuda - fix 2D flattening

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 40 +++++++------
 backends/cuda-gen/ceed-cuda-gen-operator.c    | 12 ++--
 backends/cuda-gen/ceed-cuda-gen.h             |  2 +-
 .../cuda/cuda-shared-basis-tensor-templates.h | 59 +++++++++++++------
 4 files changed, 69 insertions(+), 44 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index ae27028e08..2f06a7aa7a 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -177,7 +177,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
+                                                     CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q, CeedInt Q_1d, bool is_input,
                                                      bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   bool      is_tensor = true;
   CeedBasis basis;
@@ -220,7 +220,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
     CeedInt P = 0;
 
     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
-    code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P) << ";\n";
+    code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
   }
   code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -528,10 +528,11 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           std::string function_name = is_tensor
                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                           : "InterpNonTensor";
+          std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -551,17 +552,18 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
                                       (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" << (dim >= 3 ? Q_name : "1")
                << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-               << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradNonTensor";
 
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
-               << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
@@ -602,9 +604,10 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           std::string function_name =
               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                         : "InterpTransposeNonTensor";
+          std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -623,14 +626,15 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-               << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
-               << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -1160,6 +1164,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     if (is_at_points) Q_1d = max_num_points;
     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
   }
+  if (Q == 0) Q = Q_1d;
+  data->Q    = Q;
   data->Q_1d = Q_1d;
 
   // Check for restriction only identity operator
@@ -1389,13 +1395,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q, Q_1d,
                                                               true, is_all_tensor, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
-                                                              false, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q,
+                                                              Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 0e42eeaaed..831d6f852b 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -197,16 +197,14 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int           max_threads_per_block, min_grid_size, grid;
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
+  int   max_threads_per_block, min_grid_size, grid;
 
   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+  const CeedInt thread_1d = CeedIntMax(is_tensor ? data->Q_1d : data->Q, data->max_P_1d);
+
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  int block[3] = {thread_1d, ((!is_tensor || data->dim == 1) ? 1 : thread_1d), -1};
 
   if (is_tensor) {
     CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block,
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index 09b66171e9..30d13574aa 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -14,7 +14,7 @@
 typedef struct {
   bool           use_fallback;
   CeedInt        dim;
-  CeedInt        Q_1d;
+  CeedInt        Q, Q_1d;
   CeedInt        max_P_1d;
   CUmodule       module;
   CUfunction     op;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index 628967402a..bb69f075e6 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -184,6 +184,29 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int
   __syncthreads();
 }
 
+//------------------------------------------------------------------------------
+// 2D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
@@ -204,11 +227,11 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
+  QPack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -231,11 +254,11 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                         CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  QUnpack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
+  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -260,11 +283,11 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
+  QPack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -289,11 +312,11 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  QUnpack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
+  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -312,9 +335,7 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_
 
 template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
+  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % Q_1D, data.t_id_x / Q_1D, q_weight_1d, w);
 }
 
 //------------------------------------------------------------------------------

From c8e372f0bd7b7998d8acb114f0b7d6adab16132f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 13 Mar 2025 14:38:23 -0600
Subject: [PATCH 330/571] gen - add 3D mixed support

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |   4 +
 ...-shared-basis-tensor-flattened-templates.h | 524 ++++++++++++++++++
 .../cuda/cuda-shared-basis-tensor-templates.h | 177 ++----
 3 files changed, 569 insertions(+), 136 deletions(-)
 create mode 100644 include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 2f06a7aa7a..a9d97d2935 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1205,6 +1205,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     code << "// Non-tensor basis source\n";
     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
   }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
   if (is_at_points) {
     code << "// AtPoints basis source\n";
     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
new file mode 100644
index 0000000000..bb09edb226
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -0,0 +1,524 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory tensor product basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D tensor contraction x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract and add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                        const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
+    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index bb69f075e6..c73b7928e0 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -104,14 +104,13 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                   CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < Q_1D && t_id_y < P_1D) {
+  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+      *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -121,14 +120,13 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, cons
 // 2D tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                   CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < Q_1D && t_id_y < Q_1D) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+      *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -138,14 +136,13 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, cons
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                            CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < Q_1D && t_id_y < P_1D) {
+  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+      *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -155,14 +152,13 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_i
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                            CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < P_1D && t_id_y < P_1D) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -172,170 +168,79 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_i
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                               CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
-  if (t_id_x < P_1D && t_id_y < P_1D) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
 }
 
-//------------------------------------------------------------------------------
-// 2D pack/unpack quadrature values
-//------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D, int T_1D>
-inline __device__ void QPack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
-    __syncthreads();
-    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] : 0.0;
-    __syncthreads();
-  }
-}
-
-template <int NUM_COMP, int Q_1D, int T_1D>
-inline __device__ void QUnpack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] = U[comp];
-    __syncthreads();
-    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
-    __syncthreads();
-  }
-}
-
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                      CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                      CeedScalar *__restrict__ r_V) {
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                               CeedScalar *__restrict__ r_V) {
-  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
-  QPack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                               CeedScalar *__restrict__ r_V) {
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                                        CeedScalar *__restrict__ r_V) {
-  QUnpack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
-  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                    CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                    CeedScalar *__restrict__ r_V) {
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                             CeedScalar *__restrict__ r_V) {
-  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
-  QPack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                             CeedScalar *__restrict__ r_V) {
-  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  QUnpack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
-  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
-  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
-inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
-                                           CeedScalar *w) {
-  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
-}
-
 template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
-}
-
-template <int P_1D, int Q_1D>
-inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % Q_1D, data.t_id_x / Q_1D, q_weight_1d, w);
+  *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
 }
 
 //------------------------------------------------------------------------------

From f29bd075e7e950777af7eb28ab23847cd86b3305 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 14 Mar 2025 11:11:32 -0600
Subject: [PATCH 331/571] gpu - drop changes in AtPoints

---
 ...-shared-basis-tensor-at-points-templates.h | 68 ++++++-------------
 ...-shared-basis-tensor-at-points-templates.h | 68 ++++++-------------
 2 files changed, 43 insertions(+), 93 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 9444df06bf..eda8d4076b 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -136,16 +136,16 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void InterpAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                             const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     // Contract x direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -163,24 +163,18 @@ inline __device__ void InterpAtPoints2d_Core(SharedData_Cuda &data, const int t_
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                        CeedScalar *__restrict__ r_V) {
-  InterpAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                                      const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
     __syncthreads();
     // Contract y direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
@@ -192,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Cuda &data, con
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + t_id_x) % Q_1D;
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + t_id_y) % Q_1D;
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -203,29 +197,23 @@ inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Cuda &data, con
     }
     // Pull from shared to register
     __syncthreads();
-    if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                                 CeedScalar *__restrict__ r_C) {
-  InterpTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void GradAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                           const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract x direction
@@ -247,24 +235,18 @@ inline __device__ void GradAtPoints2d_Core(SharedData_Cuda &data, const int t_id
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                      CeedScalar *__restrict__ r_V) {
-  GradAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                                    const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract y direction
@@ -279,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Cuda &data, const
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + t_id_x) % Q_1D;
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + t_id_y) % Q_1D;
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -291,16 +273,10 @@ inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Cuda &data, const
     }
     // Pull from shared to register
     __syncthreads();
-    if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                               CeedScalar *__restrict__ r_C) {
-  GradTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
-}
-
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 48ebb0ccbe..a6abc6a807 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -136,16 +136,16 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void InterpAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                             const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     // Contract x direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -163,25 +163,18 @@ inline __device__ void InterpAtPoints2d_Core(SharedData_Hip &data, const int t_i
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
-                                        CeedScalar *__restrict__ r_V) {
-  InterpAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                                      const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X,
-                                                      CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
     __syncthreads();
     // Contract y direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
@@ -193,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Hip &data, cons
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + t_id_x) % Q_1D;
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + t_id_y) % Q_1D;
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -208,25 +201,19 @@ inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Hip &data, cons
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
-                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
-  InterpTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void GradAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                           const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract x direction
@@ -248,25 +235,18 @@ inline __device__ void GradAtPoints2d_Core(SharedData_Hip &data, const int t_id_
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
-                                      CeedScalar *__restrict__ r_V) {
-  GradAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
-inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
-                                                    const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X,
-                                                    CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
     // Clear shared memory
-    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
       // Contract y direction
@@ -281,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Hip &data, const
       if (p < NUM_POINTS) {
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + t_id_x) % Q_1D;
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + t_id_y) % Q_1D;
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
@@ -293,16 +273,10 @@ inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Hip &data, const
     }
     // Pull from shared to register
     __syncthreads();
-    if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
   }
 }
 
-template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
-inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
-                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
-  GradTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
-}
-
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------

From 259057ed0da46db025ab7b301a4a4b3f356037e0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 14 Mar 2025 12:22:18 -0600
Subject: [PATCH 332/571] gen - fix flattened indexing

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |  8 +++---
 ...-shared-basis-tensor-flattened-templates.h | 27 +++++++++++--------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index a9d97d2935..14897cf0dd 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -530,7 +530,7 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                                           : "InterpNonTensor";
           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
@@ -554,8 +554,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                                       (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" << (dim >= 3 ? Q_name : "1")
-               << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
@@ -575,7 +575,7 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                                                      ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                                      : "WeightNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
           code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
index bb09edb226..cb16d9616f 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -138,6 +138,7 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
@@ -174,6 +175,7 @@ inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
@@ -365,22 +367,22 @@ inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, c
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int Q_1D, int T_1D>
 inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
-  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
     __syncthreads();
-    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
+    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
     __syncthreads();
   }
 }
 
 template <int NUM_COMP, int Q_1D, int T_1D>
 inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
-  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
+    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
     __syncthreads();
     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
     __syncthreads();
@@ -393,7 +395,7 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
-  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
@@ -402,6 +404,7 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
@@ -411,7 +414,7 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                         CeedScalar *__restrict__ r_V) {
-  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
   QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
@@ -429,7 +432,7 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
-  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
@@ -444,6 +447,7 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
@@ -453,7 +457,7 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
@@ -477,7 +481,7 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
@@ -489,6 +493,7 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce
     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
@@ -498,7 +503,7 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
@@ -518,7 +523,7 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda
 //------------------------------------------------------------------------------
 template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
+  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
 
   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
 }

From 8014c5e77a2314c7e9ef0bdf0731d81ce05a9c67 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 14 Mar 2025 13:13:54 -0600
Subject: [PATCH 333/571] gen - set default dim to max

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 14897cf0dd..fbc8199981 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -177,8 +177,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q, CeedInt Q_1d, bool is_input,
-                                                     bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+                                                     CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt max_dim, CeedInt Q,
+                                                     CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   bool      is_tensor = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
@@ -189,7 +189,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string            option_name = (is_input ? "inputs" : "outputs");
   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
-  CeedInt                elem_size = 0, num_comp = 0, dim = 1, P_1d = 0;
+  CeedInt                elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
   CeedElemRestriction    elem_rstr;
   CeedBasis_Cuda_shared *basis_data;
 
@@ -350,9 +350,9 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 //------------------------------------------------------------------------------
 // Restriction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
-                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                       CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt field_input_buffer[],
+                                                       CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
+                                                       bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
@@ -480,9 +480,9 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 // Basis
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
-                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
-                                                 bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
+                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
+                                                 bool is_at_points, bool use_3d_slices) {
   bool      is_tensor = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
@@ -1399,13 +1399,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q, Q_1d,
-                                                              true, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], max_dim,
+                                                              Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q,
-                                                              Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -1493,11 +1493,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     code << "    // ---- Input field " << f << ": " << field_name << "\n";
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, max_dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], max_dim,
                                                                 Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, max_dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_all_tensor,
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, is_all_tensor,
                                                           is_at_points, use_3d_slices));
   }
 
@@ -1515,11 +1515,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     code << "    // ---- Output field " << i << ": " << field_name << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, max_dim, op_output_fields[i], qf_output_fields[i], Q_1d, false,
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
                                                           is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, max_dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
                                                                 is_all_tensor, is_at_points, use_3d_slices));
   }
 

From 9b91271bd9570896f9ae5225f4862ed0c149b456 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 14 Mar 2025 13:22:06 -0600
Subject: [PATCH 334/571] hip - add flattened templates

---
 ...-shared-basis-tensor-flattened-templates.h | 529 ++++++++++++++++++
 .../hip/hip-shared-basis-tensor-templates.h   | 159 ++----
 2 files changed, 571 insertions(+), 117 deletions(-)
 create mode 100644 include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h

diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
new file mode 100644
index 0000000000..864f95ae68
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
@@ -0,0 +1,529 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory tensor product basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D tensor contraction x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract and add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                        const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 2D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 3D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
+    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index b3490e3fe5..452610797b 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -91,7 +91,7 @@ inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *_
 //------------------------------------------------------------------------------
 // 1D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
 }
@@ -104,14 +104,13 @@ inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restri
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                   CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < Q_1D && t_id_y < P_1D) {
+  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+      *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -121,15 +120,13 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const
 // 2D tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                   CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
->>>>>>> b855402d (gpu - isolate core 2D tensor logic to allow flat version)
+inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < Q_1D && t_id_y < Q_1D) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt i = 0; i < P_1D; i++) {
-      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+      *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -139,14 +136,13 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                            CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < Q_1D && t_id_y < P_1D) {
+  if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+      *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
   __syncthreads();
@@ -156,14 +152,13 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                            CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
-  if (t_id_x < P_1D && t_id_y < P_1D) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -173,13 +168,12 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
-                                               CeedScalar *V) {
-  data.slice[t_id_x + t_id_y * T_1D] = *U;
+inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
-  if (t_id_x < P_1D && t_id_y < P_1D) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt i = 0; i < Q_1D; i++) {
-      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
   __syncthreads();
@@ -189,132 +183,63 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                               CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                               CeedScalar *__restrict__ r_V) {
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                                        CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                    CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
-    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                    CeedScalar *__restrict__ r_V) {
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                             CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
-                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
-template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
-                                             CeedScalar *__restrict__ r_V) {
-  GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
-}
-
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
-}
-
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
-inline __device__ void WeightTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
-                                           CeedScalar *w) {
-  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
-}
-
 template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
-}
-
-template <int P_1D, int Q_1D>
-inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
+  *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
 }
 
 //------------------------------------------------------------------------------
@@ -597,7 +522,7 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, con
 //------------------------------------------------------------------------------
 // 3D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;

From 74398b5adbecd200ffa32cb282fd0f9b2ec0d708 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 14 Mar 2025 14:46:03 -0600
Subject: [PATCH 335/571] hip - add mixed gen

---
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 371 +++++++++++-------
 backends/hip-gen/ceed-hip-gen-operator.c      |  18 +-
 backends/hip-gen/ceed-hip-gen.h               |   2 +-
 3 files changed, 239 insertions(+), 152 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index d1beb5fac7..11d7815322 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -60,35 +60,71 @@ extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_e
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
-                                               CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
-                                               bool *use_3d_slices) {
-  // Find dim, P_1d, Q_1d
-  *max_P_1d  = 0;
-  *Q_1d      = 0;
-  *dim       = 0;
-  *is_tensor = true;
+                                               CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
+                                               CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
+  // Check if all are tensor
+  *is_all_tensor = true;
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
+
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
+
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedBasis basis;
+
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
+
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
+  }
 
+  // Find max_P, max_P_1d, Q, and Q_1d
+  bool is_all_3d = true;
+
+  *max_P    = 0;
+  *max_P_1d = 0;
+  *Q        = 0;
+  *Q_1d     = 0;
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedBasis basis;
 
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       bool    is_field_tensor;
-      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      *is_tensor = *is_tensor && is_field_tensor;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
-      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
-      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      // Check if 3D
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
-      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *dim = field_dim;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
-      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
-      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *Q_1d = field_Q_1d;
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
+
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
     CeedCallBackend(CeedBasisDestroy(&basis));
   }
@@ -98,28 +134,36 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       bool    is_field_tensor;
-      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
-      *is_tensor = *is_tensor && is_field_tensor;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
-      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
-      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      // Check if 3D
       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
-      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *dim = field_dim;
-      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
-      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
-      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
-      *Q_1d = field_Q_1d;
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
+
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
     CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
   *use_3d_slices = false;
-  if (*dim == 3) {
+  if (is_all_3d && *is_all_tensor) {
     bool was_grad_found = false;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -160,17 +204,21 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                    CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, CeedInt Q_1d, bool is_input,
-                                                    bool is_tensor, bool is_at_points, bool use_3d_slices) {
+                                                    CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, CeedInt max_dim, CeedInt Q, CeedInt Q_1d,
+                                                    bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+  bool      is_tensor = true;
+  CeedBasis basis;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+
   const char           *field_name;
   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   std::string           option_name = (is_input ? "inputs" : "outputs");
   CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
-  CeedInt               elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
   CeedElemRestriction   elem_rstr;
   CeedBasis_Hip_shared *basis_data;
-  CeedBasis             basis;
 
   // Field reuse info
   bool use_previous_field = field_reuse.index != -1;
@@ -185,15 +233,22 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
-  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Set field constants
+  code << "  const CeedInt dim" << var_suffix << " = " << dim << ";\n";
+  if (is_tensor && !is_all_tensor) {
+    CeedInt P = 0;
+
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+    code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
+  }
   code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
@@ -299,9 +354,10 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 
             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
-            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
-            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
-                 << var_suffix << ");\n";
+            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
+                 << (is_tensor ? "" : var_suffix) << "];\n";
+            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
+                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         }
       }
@@ -321,18 +377,17 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 //------------------------------------------------------------------------------
 // Restriction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
-                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                      CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt field_input_buffer[],
+                                                      CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
+                                                      bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
-  std::string              P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
+  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
-  CeedInt                  elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedInt                  elem_size = 0, num_comp = 0;
   CeedSize                 l_size;
   CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
   CeedElemRestriction_Hip *rstr_data;
   CeedElemRestriction      elem_rstr;
-  CeedBasis                basis;
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -342,12 +397,6 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
   }
-  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
-  if (basis != CEED_BASIS_NONE) {
-    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
-  }
-  CeedCallBackend(CeedBasisDestroy(&basis));
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Restriction
@@ -375,7 +424,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           code << "    // CompStride: " << comp_stride << "\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-          code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+          code << "    ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
                << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
@@ -391,7 +440,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
           }
           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-          code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+          code << "    ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
                << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
           break;
         }
@@ -421,7 +470,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
         code << "    // CompStride: " << comp_stride << "\n";
         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
+        code << "    WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
              << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
@@ -437,7 +486,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
+        code << "    WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
              << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
         break;
       }
@@ -458,15 +507,19 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 // Basis
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
-                                                CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
+static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
+                                                CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
                                                 bool is_at_points, bool use_3d_slices) {
+  bool      is_tensor = true;
+  CeedBasis basis;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
-  CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
   CeedElemRestriction elem_rstr;
-  CeedBasis           basis;
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -475,8 +528,8 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
   }
   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
-  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
   }
@@ -499,11 +552,14 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
-          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
-
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
-               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+          std::string function_name = is_tensor
+                                          ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                          : "InterpNonTensor";
+          std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -521,17 +577,20 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
-
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
-               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
+                                      (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
+
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e"
-               << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
@@ -539,9 +598,11 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           code << "    // Nothing to do AtPoints\n";
         } else {
           CeedBasis_Hip_shared *basis_data;
-          std::string           function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
+          std::string           function_name = is_tensor
+                                                    ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                    : "WeightNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
           code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
@@ -568,10 +629,12 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
-              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                        : "InterpTransposeNonTensor";
+          std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
-               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
@@ -588,16 +651,17 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name =
-              (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
+          std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
+                                      std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
-               << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q"
-               << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -616,12 +680,12 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
 //------------------------------------------------------------------------------
 // QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt max_num_points,
+static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt max_dim, CeedInt max_num_points,
                                                     CeedInt num_input_fields, CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
                                                     CeedInt num_output_fields, CeedOperatorField *op_output_fields,
-                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d, bool is_tensor,
-                                                    bool is_at_points, bool use_3d_slices) {
-  std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
+                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
+                                                    bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
 
@@ -639,25 +703,28 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
         if (is_at_points) {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "*dim" << var_suffix
+               << "];\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else if (use_3d_slices) {
@@ -667,7 +734,8 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
       case CEED_EVAL_WEIGHT:
@@ -689,8 +757,8 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     code << "      const CeedInt p = i % max_num_points;\n\n";
 
     code << "      // -- Coordinates\n";
-    code << "      CeedScalar r_x[dim];\n";
-    code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+    code << "      CeedScalar r_x[max_dim];\n";
+    code << "      ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
 
     code << "      // -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -711,13 +779,13 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
-               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
-          code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name << ">(data, i, r_c"
-               << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << "      GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
@@ -747,7 +815,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -820,7 +888,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
                << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
@@ -852,7 +920,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
+          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -908,7 +976,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   // Apply QFunction
   code << "\n      // -- Apply QFunction\n";
   code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "1";
   } else {
     code << Q_name;
@@ -946,14 +1014,14 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           code << "      if (i >= points.num_per_elem[elem]) {\n";
           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << "      InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
           code << "      if (i >= points.num_per_elem[elem]) {\n";
-          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
           code << "      }\n";
-          code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << "      GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -1011,9 +1079,9 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 // Build single operator kernel
 //------------------------------------------------------------------------------
 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
-  bool                   is_tensor = true, is_at_points = false, use_3d_slices = false;
+  bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                   ceed;
-  CeedInt                Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
   CeedQFunction_Hip_gen *qf_data;
   CeedQFunction          qf;
@@ -1035,7 +1103,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // Check field compatibility
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   {
-    bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
+    bool has_shared_bases = true;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedBasis basis;
@@ -1084,7 +1152,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
       CeedCallBackend(CeedBasisDestroy(&basis));
     }
     // -- Fallback to ref if not all bases are shared
-    if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
+    if (!has_shared_bases) {
       *is_good_build = false;
       return CEED_ERROR_SUCCESS;
     }
@@ -1096,10 +1164,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 
   // Get operator data
   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
-  CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
-                                                      qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
-  if (dim == 0) dim = 1;
-  data->dim = dim;
+  {
+    CeedInt max_P, max_P_1d;
+
+    CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
+                                                        qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
+    data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
+  }
+  if (max_dim == 0) max_dim = 1;
+  data->dim = max_dim;
   if (is_at_points) {
     CeedElemRestriction_Hip *rstr_data;
     CeedElemRestriction      rstr_points = NULL;
@@ -1116,6 +1189,8 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     if (is_at_points) Q_1d = max_num_points;
     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
   }
+  if (Q == 0) Q = Q_1d;
+  data->Q    = Q;
   data->Q_1d = Q_1d;
 
   // Check for restriction only identity operator
@@ -1134,10 +1209,11 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   }
 
   // Load basis source files
-  if (is_tensor) {
+  if (!is_all_tensor) {
     code << "// Tensor basis source\n";
     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
-  } else {
+  }
+  if (!is_all_tensor) {
     code << "// Non-tensor basis source\n";
     code << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
   }
@@ -1145,6 +1221,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     code << "// AtPoints basis source\n";
     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
   }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
   code << "// CodeGen operator source\n";
   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
 
@@ -1156,7 +1236,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 
   // Define CEED_Q_VLA
   code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "#define CEED_Q_VLA 1\n\n";
   } else {
     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
@@ -1203,8 +1283,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
   }
 
-  code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
+  code << " const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << "  const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
   if (is_at_points) {
     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
@@ -1217,7 +1302,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_tensor || dim == 1) ? "" : "*OP_T_1D") << ";\n";
+  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
@@ -1226,12 +1311,14 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     input_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
     CeedEvalMode eval_mode_i;
     CeedBasis    basis_i;
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
       CeedEvalMode eval_mode_j;
       CeedBasis    basis_j;
@@ -1265,6 +1352,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     output_matrix_reuse[i].index = -1;
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
     CeedEvalMode eval_mode_i;
     CeedBasis    basis_i;
 
@@ -1300,6 +1388,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
       if (basis_i == basis_j) {
         if (is_tensor) {
           output_matrix_reuse[i].index     = j;
@@ -1322,13 +1411,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, true,
-                                                             is_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], max_dim, Q,
+                                                             Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
-                                                             false, is_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], max_dim,
+                                                             Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -1346,7 +1435,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -1356,7 +1445,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
@@ -1416,17 +1505,17 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     code << "    // ---- Input field " << f << ": " << field_name << "\n";
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d,
-                                                               true, is_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], max_dim,
+                                                               Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, is_all_tensor,
                                                          is_at_points, use_3d_slices));
   }
 
   // -- Q function
-  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
-                                                           num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, max_dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
+                                                           num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_all_tensor,
                                                            is_at_points, use_3d_slices));
 
   // -- Output basis and restriction
@@ -1438,12 +1527,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     code << "    // ---- Output field " << i << ": " << field_name << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, is_all_tensor,
                                                          is_at_points, use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
-                                                               is_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                               is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Close loop and function
@@ -1456,7 +1545,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
   if (is_at_points) block_sizes[2] = 1;
   {
     bool is_compile_good = false;
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 60f28accd9..8fd087ca78 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -131,17 +131,15 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
 
   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
-  CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  const CeedInt thread_1d = CeedIntMax(is_tensor ? data->Q_1d : data->Q, data->max_P_1d);
+
+  CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || data->dim == 1) ? 1 : thread_1d), -1};
 
   if (is_tensor) {
-    CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
     if (is_at_points) block_sizes[2] = 1;
   } else {
     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
@@ -149,19 +147,19 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
     block_sizes[2]  = elems_per_block;
   }
-  if (dim == 1 || !is_tensor) {
+  if (data->dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
-  } else if (dim == 2) {
+  } else if (data->dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
-  } else if (dim == 3) {
+  } else if (data->dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
 
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index 760fef2ed5..921bcfeb9b 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -14,7 +14,7 @@
 typedef struct {
   bool          use_fallback;
   CeedInt       dim;
-  CeedInt       Q_1d;
+  CeedInt       Q, Q_1d;
   CeedInt       max_P_1d;
   hipModule_t   module;
   hipFunction_t op;

From efa41df35aba6fbf99ee55a3580c03081f41c595 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 14 Mar 2025 15:00:45 -0600
Subject: [PATCH 336/571] fix - harmless warnings

---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 4 ++--
 backends/hip-gen/ceed-hip-gen-operator-build.cpp   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index fbc8199981..257e6a7d86 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1055,7 +1055,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
   bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                    ceed;
-  CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedInt                 Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
   CeedQFunction_Cuda_gen *qf_data;
   CeedQFunction           qf;
@@ -1139,7 +1139,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // Get operator data
   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
   {
-    CeedInt max_P, max_P_1d;
+    CeedInt max_P = 0, max_P_1d = 0;
 
     CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
                                                          op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor,
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 11d7815322..1b1b9927fa 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1081,7 +1081,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                   ceed;
-  CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
   CeedQFunction_Hip_gen *qf_data;
   CeedQFunction          qf;
@@ -1165,7 +1165,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // Get operator data
   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
   {
-    CeedInt max_P, max_P_1d;
+    CeedInt max_P = 0, max_P_1d = 0;
 
     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));

From ce44184cf000e5faa51c26eb25197f2718cc4a16 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 17 Mar 2025 08:40:14 -0600
Subject: [PATCH 337/571] gen - skip unneeded pack/unpack

---
 ...-shared-basis-tensor-flattened-templates.h | 50 +++++++++----------
 ...-shared-basis-tensor-flattened-templates.h | 50 +++++++++----------
 2 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
index cb16d9616f..e08eb7a20e 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -133,13 +133,13 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
-  QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -151,12 +151,12 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, C
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -168,15 +168,15 @@ inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
-  QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -188,14 +188,14 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, Cee
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -398,14 +398,14 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
-  QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -417,13 +417,13 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -435,7 +435,7 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
@@ -447,8 +447,8 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
-  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -460,7 +460,7 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
@@ -472,7 +472,7 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee
     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -484,7 +484,7 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
@@ -493,8 +493,8 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce
     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
-  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -506,7 +506,7 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
@@ -515,7 +515,7 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
index 864f95ae68..634509c311 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
@@ -133,13 +133,13 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
-  QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -151,12 +151,12 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, Ce
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -168,15 +168,15 @@ inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *_
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
-  QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -188,14 +188,14 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, Ceed
   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
   CeedScalar r_t[1];
 
-  QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
-  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -398,14 +398,14 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
-  QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -417,13 +417,13 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, Ce
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -435,7 +435,7 @@ inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *_
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
@@ -447,8 +447,8 @@ inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *_
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
-  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -460,7 +460,7 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, Ceed
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
@@ -472,7 +472,7 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, Ceed
     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -484,7 +484,7 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, Cee
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
@@ -493,8 +493,8 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, Cee
     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
-  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -506,7 +506,7 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
-  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
@@ -515,7 +515,7 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
-  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
 //------------------------------------------------------------------------------

From a61b1c9113bd68e135048720493b31b7a447a3ac Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 17 Mar 2025 13:16:02 -0600
Subject: [PATCH 338/571] gen - small fixes

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 22 +++++++++----------
 backends/cuda-gen/ceed-cuda-gen-operator.c    |  6 ++---
 backends/cuda-gen/ceed-cuda-gen.h             |  1 +
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 22 +++++++++----------
 backends/hip-gen/ceed-hip-gen-operator.c      | 12 +++++-----
 backends/hip-gen/ceed-hip-gen.h               |  1 +
 6 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 257e6a7d86..8cd83b867c 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -696,8 +696,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "*dim" << var_suffix
-               << "];\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
@@ -725,9 +724,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   if (is_at_points) {
     // We need to handle batches of points
     code << "\n    // Note: Using batches of points\n";
-    code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
+    code << "    const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
     code << "    #pragma unroll\n";
-    code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
+    code << "    for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
     code << "      const CeedInt p = i % max_num_points;\n\n";
 
     code << "      // -- Coordinates\n";
@@ -1289,8 +1288,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   code << "  data.t_id_x = threadIdx.x;\n";
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+  code << "  data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << "  data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
@@ -1417,23 +1416,21 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   CeedInt max_rstr_buffer_size = 1;
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             num_comp, elem_size;
+    CeedInt             num_comp;
     CeedElemRestriction elem_rstr;
 
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             num_comp, elem_size;
+    CeedInt             num_comp;
     CeedElemRestriction elem_rstr;
 
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
@@ -1533,6 +1530,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     bool          is_compile_good = false;
     const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
 
+    data->thread_1d = T_1d;
     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
     if (is_compile_good) {
       *is_good_build = true;
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 831d6f852b..40774b44e7 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -201,16 +201,14 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   int   max_threads_per_block, min_grid_size, grid;
 
   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
-  const CeedInt thread_1d = CeedIntMax(is_tensor ? data->Q_1d : data->Q, data->max_P_1d);
-
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, ((!is_tensor || data->dim == 1) ? 1 : thread_1d), -1};
+  int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
 
   if (is_tensor) {
     CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block,
                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
   } else {
-    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));
 
     grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
     block[2] = elems_per_block;
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index 30d13574aa..dcdd6bec3d 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -16,6 +16,7 @@ typedef struct {
   CeedInt        dim;
   CeedInt        Q, Q_1d;
   CeedInt        max_P_1d;
+  CeedInt        thread_1d;
   CUmodule       module;
   CUfunction     op;
   FieldsInt_Cuda indices;
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 1b1b9927fa..21abf9d762 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -722,8 +722,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "*dim" << var_suffix
-               << "];\n";
+          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
           code << "    }\n";
@@ -751,9 +750,9 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   if (is_at_points) {
     // We need to handle batches of points
     code << "\n    // Note: Using batches of points\n";
-    code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
+    code << "    const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
     code << "    #pragma unroll\n";
-    code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
+    code << "    for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
     code << "      const CeedInt p = i % max_num_points;\n\n";
 
     code << "      // -- Coordinates\n";
@@ -1301,8 +1300,8 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   code << "  data.t_id_x = threadIdx.x;\n";
   code << "  data.t_id_y = threadIdx.y;\n";
   code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+  code << "  data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << "  data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
@@ -1429,23 +1428,21 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   CeedInt max_rstr_buffer_size = 1;
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             num_comp, elem_size;
+    CeedInt             num_comp;
     CeedElemRestriction elem_rstr;
 
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             num_comp, elem_size;
+    CeedInt             num_comp;
     CeedElemRestriction elem_rstr;
 
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
+    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   code << "    // Scratch restriction buffer space\n";
@@ -1550,6 +1547,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   {
     bool is_compile_good = false;
 
+    data->thread_1d = block_sizes[0];
     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
     if (is_compile_good) {
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 8fd087ca78..a9e284ba7a 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -134,34 +134,32 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
   void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
 
   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
-  const CeedInt thread_1d = CeedIntMax(is_tensor ? data->Q_1d : data->Q, data->max_P_1d);
-
-  CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || data->dim == 1) ? 1 : thread_1d), -1};
+  CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
 
   if (is_tensor) {
     CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
     if (is_at_points) block_sizes[2] = 1;
   } else {
-    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+    CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;
 
     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
     block_sizes[2]  = elems_per_block;
   }
   if (data->dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   } else if (data->dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
   } else if (data->dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
 
     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index 921bcfeb9b..dba540d58c 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -16,6 +16,7 @@ typedef struct {
   CeedInt       dim;
   CeedInt       Q, Q_1d;
   CeedInt       max_P_1d;
+  CeedInt       thread_1d;
   hipModule_t   module;
   hipFunction_t op;
   FieldsInt_Hip indices;

From 6de40545466b5fa4cbcd17feda0352f2a4b1d743 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 17 Mar 2025 15:05:38 -0600
Subject: [PATCH 339/571] gen - trim down rstr buffer size

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 34 ++++++++++++-------
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 34 ++++++++++++-------
 2 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 8cd83b867c..d7b9edf92a 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1416,22 +1416,32 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   CeedInt max_rstr_buffer_size = 1;
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             num_comp;
-    CeedElemRestriction elem_rstr;
+    CeedEvalMode eval_mode;
 
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
-    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             num_comp;
-    CeedElemRestriction elem_rstr;
+    CeedEvalMode eval_mode;
 
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
-    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
   }
   code << "    // Scratch restriction buffer space\n";
   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 21abf9d762..8d5fb21985 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1428,22 +1428,32 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   CeedInt max_rstr_buffer_size = 1;
 
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             num_comp;
-    CeedElemRestriction elem_rstr;
+    CeedEvalMode eval_mode;
 
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
-    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedInt             num_comp;
-    CeedElemRestriction elem_rstr;
+    CeedEvalMode eval_mode;
 
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
-    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
   }
   code << "    // Scratch restriction buffer space\n";
   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";

From b8245c6c10c8896f1cf4b2f518a4d3ebe17933b7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 17 Mar 2025 15:15:57 -0600
Subject: [PATCH 340/571] gen - minor consistency

---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 14 +++++---------
 backends/hip-gen/ceed-hip-gen-operator-build.cpp   | 14 +++++---------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index d7b9edf92a..9a4443b1e3 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -685,9 +685,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
         if (is_at_points) {
           // Accumulator for point data
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
-          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
-          code << "    }\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
         } else {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
                << "];\n";
@@ -697,15 +696,12 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
         if (is_at_points) {
           // Accumulator for point data
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
-          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
-          code << "    }\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
         } else if (use_3d_slices) {
           // Accumulator for gradient slices
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
-          code << "      r_q" << var_suffix << "[i] = 0.0;\n";
-          code << "    }\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
         } else {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 8d5fb21985..2cc6b0adbf 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -711,9 +711,8 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
         if (is_at_points) {
           // Accumulator for point data
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
-          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
-          code << "    }\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
         } else {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
                << "];\n";
@@ -723,15 +722,12 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
         if (is_at_points) {
           // Accumulator for point data
           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
-          code << "      r_c" << var_suffix << "[i] = 0.0;\n";
-          code << "    }\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
         } else if (use_3d_slices) {
           // Accumulator for gradient slices
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
-          code << "      r_q" << var_suffix << "[i] = 0.0;\n";
-          code << "    }\n";
+          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
         } else {
           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";

From 2d3e8ae507b7e626461a8a7f18bf4ea9099d676a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 16:11:25 -0600
Subject: [PATCH 341/571] doc - add ex3 to documentation

---
 examples/ceed/index.md | 58 ++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/examples/ceed/index.md b/examples/ceed/index.md
index 5d2d7a3807..785c2cff8f 100644
--- a/examples/ceed/index.md
+++ b/examples/ceed/index.md
@@ -1,45 +1,39 @@
 # Standalone libCEED
 
-The following two examples have no dependencies, and are designed to be self-contained.
-For additional examples that use external discretization libraries (MFEM, PETSc, Nek5000
-etc.) see the subdirectories in {file}`examples/`.
+The following three examples have no dependencies, and are designed to be self-contained.
+For additional examples that use external discretization libraries (MFEM, PETSc, Nek5000 etc.) see the subdirectories in {file}`examples/`.
 
 (ex1-volume)=
 
 ## Ex1-Volume
 
-This example is located in the subdirectory {file}`examples/ceed`. It illustrates a
-simple usage of libCEED to compute the volume of a given body using a matrix-free
-application of the mass operator. Arbitrary mesh and solution orders in 1D, 2D, and 3D
-are supported from the same code.
+This example is located in the subdirectory {file}`examples/ceed`.
+It illustrates a simple usage of libCEED to compute the volume of a given body using a matrix-free application of the mass operator.
+Arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code.
 
-This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D
-domain $\Omega$ respectively, by applying the mass operator to a vector of
-$1$s. It computes:
+This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D domain $\Omega$ respectively, by applying the mass operator to a vector of $1$s.
+It computes:
 
 $$
 I = \int_{\Omega} 1 \, dV .
 $$ (eq-ex1-volume)
 
-Using the same notation as in {ref}`theoretical-framework`, we write here the vector
-$u(x)\equiv 1$ in the Galerkin approximation,
-and find the volume of $\Omega$ as
+Using the same notation as in {ref}`theoretical-framework`, we write here the vector $u(x)\equiv 1$ in the Galerkin approximation, and find the volume of $\Omega$ as
 
 $$
 \sum_e \int_{\Omega_e} v(x) 1 \, dV
 $$ (volume-sum)
 
-with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$,
-the test functions.
+with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, the test functions.
 
 (ex2-surface)=
 
 ## Ex2-Surface
 
-This example is located in the subdirectory {file}`examples/ceed`. It computes the
-surface area of a given body using matrix-free application of a diffusion operator.
-Similar to {ref}`Ex1-Volume`, arbitrary mesh and solution orders in 1D, 2D, and 3D
-are supported from the same code. It computes:
+This example is located in the subdirectory {file}`examples/ceed`.
+It computes the surface area of a given body using matrix-free application of a diffusion operator.
+Similar to {ref}`Ex1-Volume`, arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code.
+It computes:
 
 $$
 I = \int_{\partial \Omega} 1 \, dS ,
@@ -65,3 +59,29 @@ Since we have chosen $u$ such that $\nabla u \cdot \hat{\bm n} = 1$, the boundar
 $$
 \int_\Omega \nabla v \cdot \nabla u \, dV \approx \sum_e \int_{\partial \Omega_e} v(x) 1 \, dS .
 $$
+
+(ex3-volume)=
+
+## Ex3-Volume
+
+This example is located in the subdirectory {file}`examples/ceed`.
+It illustrates a more complex usage of libCEED to compute the volume of a given body using a matrix-free application of the screened Poisson operator.
+Arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code.
+
+This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D domain $\Omega$ respectively, by applying the screened Poisson operator to a vector of $1$s.
+It computes:
+
+$$
+I = \int_{\Omega} \left( 1 + \nabla^2 1 \right) \, dV .
+$$ (eq-ex3-volume)
+
+Using the same notation as in {ref}`theoretical-framework`, we write here the vector $u(x)\equiv 1$ in the Galerkin approximation, and find the volume of $\Omega$ as
+
+$$
+\sum_e \int_{\Omega_e}\left( v(x) 1 + \nabla v(x) \cdot 0 \right) \, dV
+$$ (volume-sum)
+
+with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, the test functions.
+
+The addition of the Poisson term is not needed to compute the volume of the region, as shown in example 1.
+Rather, this example illustrates the ability to add multiple evaluation modes for the same input or output vector in a libCEED operator.

From 8b89f79d14e4db4089b0333e54c7a137bf490b6c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 16:15:57 -0600
Subject: [PATCH 342/571] doc - add mixed gen support to release notes

---
 doc/sphinx/source/releasenotes.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index a9f765a2fe..28d5caaf62 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -25,6 +25,7 @@ On this page we provide a summary of the main API changes, new features and exam
 - Allow user to set additional compiler options for CUDA and HIP JiT.
 Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`.
 - Added non-tensor basis support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen`.
+- Added support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen` for operators with both tensor and non-tensor bases.
 
 ### Examples
 

From d275d636ccaa61e594421fac80252590e7a77ccf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 16:27:57 -0600
Subject: [PATCH 343/571] minor - upate copyright to 2025

---
 LICENSE                                                         | 2 +-
 Makefile                                                        | 2 +-
 README.md                                                       | 2 +-
 backends/avx/ceed-avx-blocked.c                                 | 2 +-
 backends/avx/ceed-avx-serial.c                                  | 2 +-
 backends/avx/ceed-avx-tensor.c                                  | 2 +-
 backends/avx/ceed-avx.h                                         | 2 +-
 backends/blocked/ceed-blocked-operator.c                        | 2 +-
 backends/blocked/ceed-blocked.c                                 | 2 +-
 backends/blocked/ceed-blocked.h                                 | 2 +-
 backends/ceed-backend-list.h                                    | 2 +-
 backends/ceed-backend-weak.c                                    | 2 +-
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp              | 2 +-
 backends/cuda-gen/ceed-cuda-gen-operator-build.h                | 2 +-
 backends/cuda-gen/ceed-cuda-gen-operator.c                      | 2 +-
 backends/cuda-gen/ceed-cuda-gen-qfunction.c                     | 2 +-
 backends/cuda-gen/ceed-cuda-gen.c                               | 2 +-
 backends/cuda-gen/ceed-cuda-gen.h                               | 2 +-
 backends/cuda-ref/ceed-cuda-ref-basis.c                         | 2 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c                      | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp              | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunction-load.h                | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunction.c                     | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c              | 2 +-
 backends/cuda-ref/ceed-cuda-ref-restriction.c                   | 2 +-
 backends/cuda-ref/ceed-cuda-ref-vector.c                        | 2 +-
 backends/cuda-ref/ceed-cuda-ref.c                               | 2 +-
 backends/cuda-ref/ceed-cuda-ref.h                               | 2 +-
 backends/cuda-ref/kernels/cuda-ref-vector.cu                    | 2 +-
 backends/cuda-shared/ceed-cuda-shared-basis.c                   | 2 +-
 backends/cuda-shared/ceed-cuda-shared.c                         | 2 +-
 backends/cuda-shared/ceed-cuda-shared.h                         | 2 +-
 backends/cuda/ceed-cuda-common.c                                | 2 +-
 backends/cuda/ceed-cuda-common.h                                | 2 +-
 backends/cuda/ceed-cuda-compile.cpp                             | 2 +-
 backends/cuda/ceed-cuda-compile.h                               | 2 +-
 backends/hip-gen/ceed-hip-gen-operator-build.cpp                | 2 +-
 backends/hip-gen/ceed-hip-gen-operator-build.h                  | 2 +-
 backends/hip-gen/ceed-hip-gen-operator.c                        | 2 +-
 backends/hip-gen/ceed-hip-gen-qfunction.c                       | 2 +-
 backends/hip-gen/ceed-hip-gen.c                                 | 2 +-
 backends/hip-gen/ceed-hip-gen.h                                 | 2 +-
 backends/hip-ref/ceed-hip-ref-basis.c                           | 2 +-
 backends/hip-ref/ceed-hip-ref-operator.c                        | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunction-load.cpp                | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunction-load.h                  | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunction.c                       | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunctioncontext.c                | 2 +-
 backends/hip-ref/ceed-hip-ref-restriction.c                     | 2 +-
 backends/hip-ref/ceed-hip-ref-vector.c                          | 2 +-
 backends/hip-ref/ceed-hip-ref.c                                 | 2 +-
 backends/hip-ref/ceed-hip-ref.h                                 | 2 +-
 backends/hip-ref/kernels/hip-ref-vector.hip.cpp                 | 2 +-
 backends/hip-shared/ceed-hip-shared-basis.c                     | 2 +-
 backends/hip-shared/ceed-hip-shared.c                           | 2 +-
 backends/hip-shared/ceed-hip-shared.h                           | 2 +-
 backends/hip/ceed-hip-common.c                                  | 2 +-
 backends/hip/ceed-hip-common.h                                  | 2 +-
 backends/hip/ceed-hip-compile.cpp                               | 2 +-
 backends/hip/ceed-hip-compile.h                                 | 2 +-
 backends/magma/ceed-magma-basis.c                               | 2 +-
 backends/magma/ceed-magma-common.c                              | 2 +-
 backends/magma/ceed-magma-common.h                              | 2 +-
 backends/magma/ceed-magma-det.c                                 | 2 +-
 backends/magma/ceed-magma-gemm-nontensor.cpp                    | 2 +-
 backends/magma/ceed-magma-gemm-nontensor.h                      | 2 +-
 backends/magma/ceed-magma-gemm-selector.cpp                     | 2 +-
 backends/magma/ceed-magma-gemm-selector.h                       | 2 +-
 backends/magma/ceed-magma.c                                     | 2 +-
 backends/magma/ceed-magma.h                                     | 2 +-
 backends/magma/tuning/Makefile                                  | 2 +-
 backends/magma/tuning/generate_tuning.py                        | 2 +-
 backends/magma/tuning/tuning.cpp                                | 2 +-
 backends/memcheck/ceed-memcheck-blocked.c                       | 2 +-
 backends/memcheck/ceed-memcheck-qfunction.c                     | 2 +-
 backends/memcheck/ceed-memcheck-qfunctioncontext.c              | 2 +-
 backends/memcheck/ceed-memcheck-restriction.c                   | 2 +-
 backends/memcheck/ceed-memcheck-serial.c                        | 2 +-
 backends/memcheck/ceed-memcheck-vector.c                        | 2 +-
 backends/memcheck/ceed-memcheck.h                               | 2 +-
 backends/occa/ceed-occa-basis.cpp                               | 2 +-
 backends/occa/ceed-occa-basis.hpp                               | 2 +-
 backends/occa/ceed-occa-ceed-object.cpp                         | 2 +-
 backends/occa/ceed-occa-ceed-object.hpp                         | 2 +-
 backends/occa/ceed-occa-context.cpp                             | 2 +-
 backends/occa/ceed-occa-context.hpp                             | 2 +-
 backends/occa/ceed-occa-cpu-operator.cpp                        | 2 +-
 backends/occa/ceed-occa-cpu-operator.hpp                        | 2 +-
 backends/occa/ceed-occa-elem-restriction.cpp                    | 2 +-
 backends/occa/ceed-occa-elem-restriction.hpp                    | 2 +-
 backends/occa/ceed-occa-gpu-operator.cpp                        | 2 +-
 backends/occa/ceed-occa-gpu-operator.hpp                        | 2 +-
 backends/occa/ceed-occa-kernels.hpp                             | 2 +-
 backends/occa/ceed-occa-operator-args.cpp                       | 2 +-
 backends/occa/ceed-occa-operator-args.hpp                       | 2 +-
 backends/occa/ceed-occa-operator-field.cpp                      | 2 +-
 backends/occa/ceed-occa-operator-field.hpp                      | 2 +-
 backends/occa/ceed-occa-operator.cpp                            | 2 +-
 backends/occa/ceed-occa-operator.hpp                            | 2 +-
 backends/occa/ceed-occa-qfunction-args.cpp                      | 2 +-
 backends/occa/ceed-occa-qfunction-args.hpp                      | 2 +-
 backends/occa/ceed-occa-qfunction-field.cpp                     | 2 +-
 backends/occa/ceed-occa-qfunction-field.hpp                     | 2 +-
 backends/occa/ceed-occa-qfunction.cpp                           | 2 +-
 backends/occa/ceed-occa-qfunction.hpp                           | 2 +-
 backends/occa/ceed-occa-qfunctioncontext.cpp                    | 2 +-
 backends/occa/ceed-occa-qfunctioncontext.hpp                    | 2 +-
 backends/occa/ceed-occa-simplex-basis.cpp                       | 2 +-
 backends/occa/ceed-occa-simplex-basis.hpp                       | 2 +-
 backends/occa/ceed-occa-tensor-basis.cpp                        | 2 +-
 backends/occa/ceed-occa-tensor-basis.hpp                        | 2 +-
 backends/occa/ceed-occa-types.hpp                               | 2 +-
 backends/occa/ceed-occa-vector.cpp                              | 2 +-
 backends/occa/ceed-occa-vector.hpp                              | 2 +-
 backends/occa/ceed-occa.cpp                                     | 2 +-
 backends/occa/ceed-occa.h                                       | 2 +-
 backends/occa/kernels/elem-restriction.cpp                      | 2 +-
 backends/occa/kernels/elem-restriction.hpp                      | 2 +-
 backends/occa/kernels/kernel-defines.hpp                        | 2 +-
 backends/occa/kernels/set-value.cpp                             | 2 +-
 backends/occa/kernels/set-value.hpp                             | 2 +-
 backends/occa/kernels/simplex-basis.hpp                         | 2 +-
 backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp       | 2 +-
 backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp       | 2 +-
 backends/occa/kernels/tensor-basis.hpp                          | 2 +-
 backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp      | 2 +-
 backends/opt/ceed-opt-blocked.c                                 | 2 +-
 backends/opt/ceed-opt-operator.c                                | 2 +-
 backends/opt/ceed-opt-serial.c                                  | 2 +-
 backends/opt/ceed-opt-tensor.c                                  | 2 +-
 backends/opt/ceed-opt.h                                         | 2 +-
 backends/ref/ceed-ref-basis.c                                   | 2 +-
 backends/ref/ceed-ref-operator.c                                | 2 +-
 backends/ref/ceed-ref-qfunction.c                               | 2 +-
 backends/ref/ceed-ref-qfunctioncontext.c                        | 2 +-
 backends/ref/ceed-ref-restriction.c                             | 2 +-
 backends/ref/ceed-ref-tensor.c                                  | 2 +-
 backends/ref/ceed-ref-vector.c                                  | 2 +-
 backends/ref/ceed-ref.c                                         | 2 +-
 backends/ref/ceed-ref.h                                         | 2 +-
 backends/sycl-gen/ceed-sycl-gen-operator-build.hpp              | 2 +-
 backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp         | 2 +-
 backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp               | 2 +-
 backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp              | 2 +-
 backends/sycl-gen/ceed-sycl-gen.hpp                             | 2 +-
 backends/sycl-gen/ceed-sycl-gen.sycl.cpp                        | 2 +-
 backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp                  | 2 +-
 backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp               | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp              | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp         | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp              | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp       | 2 +-
 backends/sycl-ref/ceed-sycl-ref.hpp                             | 2 +-
 backends/sycl-ref/ceed-sycl-ref.sycl.cpp                        | 2 +-
 backends/sycl-ref/ceed-sycl-restriction.sycl.cpp                | 2 +-
 backends/sycl-ref/ceed-sycl-vector.sycl.cpp                     | 2 +-
 backends/sycl-ref/kernels/sycl-ref-vector.cpp                   | 2 +-
 backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp            | 2 +-
 backends/sycl-shared/ceed-sycl-shared.hpp                       | 2 +-
 backends/sycl-shared/ceed-sycl-shared.sycl.cpp                  | 2 +-
 backends/sycl/ceed-sycl-common.hpp                              | 2 +-
 backends/sycl/ceed-sycl-common.sycl.cpp                         | 2 +-
 backends/sycl/ceed-sycl-compile.hpp                             | 2 +-
 backends/sycl/ceed-sycl-compile.sycl.cpp                        | 2 +-
 backends/xsmm/ceed-xsmm-blocked.c                               | 2 +-
 backends/xsmm/ceed-xsmm-serial.c                                | 2 +-
 backends/xsmm/ceed-xsmm-tensor.c                                | 2 +-
 backends/xsmm/ceed-xsmm.h                                       | 2 +-
 benchmarks/benchmark.sh                                         | 2 +-
 benchmarks/petsc-bps.sh                                         | 2 +-
 benchmarks/petsc-bpsraw.sh                                      | 2 +-
 benchmarks/postprocess_base.py                                  | 2 +-
 benchmarks/postprocess_plot.py                                  | 2 +-
 benchmarks/postprocess_table.py                                 | 2 +-
 common.mk                                                       | 2 +-
 examples/ceed/Makefile                                          | 2 +-
 examples/ceed/ex1-volume.c                                      | 2 +-
 examples/ceed/ex1-volume.h                                      | 2 +-
 examples/ceed/ex2-surface.c                                     | 2 +-
 examples/ceed/ex2-surface.h                                     | 2 +-
 examples/ceed/ex3-volume.c                                      | 2 +-
 examples/ceed/ex3-volume.h                                      | 2 +-
 examples/fluids/include/bc_definition.h                         | 2 +-
 examples/fluids/include/log_events.h                            | 2 +-
 examples/fluids/include/mat-ceed-impl.h                         | 2 +-
 examples/fluids/include/mat-ceed.h                              | 2 +-
 examples/fluids/include/petsc-ceed-utils.h                      | 2 +-
 examples/fluids/include/petsc-ceed.h                            | 2 +-
 examples/fluids/include/petsc_ops.h                             | 2 +-
 examples/fluids/navierstokes.c                                  | 2 +-
 examples/fluids/navierstokes.h                                  | 2 +-
 examples/fluids/problems/advection.c                            | 2 +-
 examples/fluids/problems/bc_freestream.c                        | 2 +-
 examples/fluids/problems/bc_slip.c                              | 2 +-
 examples/fluids/problems/blasius.c                              | 2 +-
 examples/fluids/problems/channel.c                              | 2 +-
 examples/fluids/problems/densitycurrent.c                       | 2 +-
 examples/fluids/problems/eulervortex.c                          | 2 +-
 examples/fluids/problems/gaussianwave.c                         | 2 +-
 examples/fluids/problems/newtonian.c                            | 2 +-
 examples/fluids/problems/shocktube.c                            | 2 +-
 examples/fluids/problems/stg_shur14.c                           | 2 +-
 examples/fluids/problems/stg_shur14.h                           | 2 +-
 examples/fluids/problems/taylorgreen.c                          | 2 +-
 examples/fluids/qfunctions/advection.h                          | 2 +-
 examples/fluids/qfunctions/advection_types.h                    | 2 +-
 examples/fluids/qfunctions/bc_freestream.h                      | 2 +-
 examples/fluids/qfunctions/bc_freestream_type.h                 | 2 +-
 examples/fluids/qfunctions/bc_slip.h                            | 2 +-
 examples/fluids/qfunctions/blasius.h                            | 2 +-
 examples/fluids/qfunctions/channel.h                            | 2 +-
 examples/fluids/qfunctions/densitycurrent.h                     | 2 +-
 examples/fluids/qfunctions/differential_filter.h                | 2 +-
 examples/fluids/qfunctions/differential_filter_enums.h          | 2 +-
 examples/fluids/qfunctions/eulervortex.h                        | 2 +-
 examples/fluids/qfunctions/gaussianwave.h                       | 2 +-
 examples/fluids/qfunctions/grid_anisotropy_tensor.h             | 2 +-
 examples/fluids/qfunctions/inverse_multiplicity.h               | 2 +-
 examples/fluids/qfunctions/mass.h                               | 2 +-
 examples/fluids/qfunctions/newtonian.h                          | 2 +-
 examples/fluids/qfunctions/newtonian_state.h                    | 2 +-
 examples/fluids/qfunctions/newtonian_types.h                    | 2 +-
 examples/fluids/qfunctions/riemann_solver.h                     | 2 +-
 examples/fluids/qfunctions/setupgeo.h                           | 2 +-
 examples/fluids/qfunctions/setupgeo2d.h                         | 2 +-
 examples/fluids/qfunctions/setupgeo_helpers.h                   | 2 +-
 examples/fluids/qfunctions/shocktube.h                          | 2 +-
 examples/fluids/qfunctions/stabilization.h                      | 2 +-
 examples/fluids/qfunctions/stabilization_types.h                | 2 +-
 examples/fluids/qfunctions/stg_shur14.h                         | 2 +-
 examples/fluids/qfunctions/stg_shur14_type.h                    | 2 +-
 examples/fluids/qfunctions/strong_boundary_conditions.h         | 2 +-
 examples/fluids/qfunctions/taylorgreen.h                        | 2 +-
 examples/fluids/qfunctions/turb_spanstats.h                     | 2 +-
 examples/fluids/qfunctions/turb_stats_types.h                   | 2 +-
 examples/fluids/qfunctions/utils.h                              | 2 +-
 examples/fluids/qfunctions/utils_eigensolver_jacobi.h           | 2 +-
 examples/fluids/qfunctions/velocity_gradient_projection.h       | 2 +-
 examples/fluids/src/bc_definition.c                             | 2 +-
 examples/fluids/src/boundary_condition.c                        | 2 +-
 examples/fluids/src/cloptions.c                                 | 2 +-
 examples/fluids/src/differential_filter.c                       | 2 +-
 examples/fluids/src/dm_utils.c                                  | 2 +-
 examples/fluids/src/grid_anisotropy_tensor.c                    | 2 +-
 examples/fluids/src/inverse_multiplicity.c                      | 2 +-
 examples/fluids/src/log_events.c                                | 2 +-
 examples/fluids/src/misc.c                                      | 2 +-
 examples/fluids/src/petsc_ops.c                                 | 2 +-
 examples/fluids/src/qdata.c                                     | 2 +-
 examples/fluids/src/setupdm.c                                   | 2 +-
 examples/fluids/src/setuplibceed.c                              | 2 +-
 examples/fluids/src/setupts.c                                   | 2 +-
 examples/fluids/src/strong_boundary_conditions.c                | 2 +-
 examples/fluids/src/turb_spanstats.c                            | 2 +-
 examples/fluids/src/velocity_gradient_projection.c              | 2 +-
 examples/mfem/Makefile                                          | 2 +-
 examples/mfem/bp1.cpp                                           | 2 +-
 examples/mfem/bp1.h                                             | 2 +-
 examples/mfem/bp1.hpp                                           | 2 +-
 examples/mfem/bp3.cpp                                           | 2 +-
 examples/mfem/bp3.h                                             | 2 +-
 examples/mfem/bp3.hpp                                           | 2 +-
 examples/nek/bps/bps.h                                          | 2 +-
 examples/nek/bps/bps.usr                                        | 2 +-
 examples/petsc/Makefile                                         | 2 +-
 examples/petsc/area.c                                           | 2 +-
 examples/petsc/area.h                                           | 2 +-
 examples/petsc/bps.c                                            | 2 +-
 examples/petsc/bps.h                                            | 2 +-
 examples/petsc/bpsraw.c                                         | 2 +-
 examples/petsc/bpssphere.c                                      | 2 +-
 examples/petsc/bpssphere.h                                      | 2 +-
 examples/petsc/bpsswarm.c                                       | 2 +-
 examples/petsc/dmswarm.c                                        | 2 +-
 examples/petsc/include/areaproblemdata.h                        | 2 +-
 examples/petsc/include/bpsproblemdata.h                         | 2 +-
 examples/petsc/include/libceedsetup.h                           | 2 +-
 examples/petsc/include/matops.h                                 | 2 +-
 examples/petsc/include/petscutils.h                             | 2 +-
 examples/petsc/include/petscversion.h                           | 2 +-
 examples/petsc/include/sphereproblemdata.h                      | 2 +-
 examples/petsc/include/structs.h                                | 2 +-
 examples/petsc/include/swarmutils.h                             | 2 +-
 examples/petsc/multigrid.c                                      | 2 +-
 examples/petsc/qfunctions/area/areacube.h                       | 2 +-
 examples/petsc/qfunctions/area/areasphere.h                     | 2 +-
 examples/petsc/qfunctions/bps/bp1.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp1sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/bp2.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp2sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/bp3.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp3sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/bp4.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp4sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/common.h                          | 2 +-
 examples/petsc/qfunctions/swarm/swarmmass.h                     | 2 +-
 examples/petsc/src/libceedsetup.c                               | 2 +-
 examples/petsc/src/petscutils.c                                 | 2 +-
 examples/petsc/src/swarmutils.c                                 | 2 +-
 examples/rust/ex1-volume/src/main.rs                            | 2 +-
 examples/rust/ex1-volume/src/opt.rs                             | 2 +-
 examples/rust/ex1-volume/src/transform.rs                       | 2 +-
 examples/rust/ex2-surface/src/main.rs                           | 2 +-
 examples/rust/ex2-surface/src/opt.rs                            | 2 +-
 examples/rust/ex2-surface/src/transform.rs                      | 2 +-
 examples/rust/ex3-vector-volume/src/main.rs                     | 2 +-
 examples/rust/ex3-vector-volume/src/opt.rs                      | 2 +-
 examples/rust/ex3-vector-volume/src/transform.rs                | 2 +-
 examples/rust/ex4-vector-surface/src/main.rs                    | 2 +-
 examples/rust/ex4-vector-surface/src/opt.rs                     | 2 +-
 examples/rust/ex4-vector-surface/src/transform.rs               | 2 +-
 examples/rust/mesh/src/lib.rs                                   | 2 +-
 examples/solids/Makefile                                        | 2 +-
 examples/solids/elasticity.c                                    | 2 +-
 examples/solids/elasticity.h                                    | 2 +-
 examples/solids/include/boundary.h                              | 2 +-
 examples/solids/include/cl-options.h                            | 2 +-
 examples/solids/include/matops.h                                | 2 +-
 examples/solids/include/misc.h                                  | 2 +-
 examples/solids/include/setup-dm.h                              | 2 +-
 examples/solids/include/setup-libceed.h                         | 2 +-
 examples/solids/include/structs.h                               | 2 +-
 examples/solids/include/utils.h                                 | 2 +-
 examples/solids/problems/cl-problems.h                          | 2 +-
 examples/solids/problems/finite-strain-mooney-rivlin.c          | 2 +-
 examples/solids/problems/finite-strain-neo-hookean.c            | 2 +-
 examples/solids/problems/linear.c                               | 2 +-
 examples/solids/problems/mooney-rivlin.c                        | 2 +-
 examples/solids/problems/mooney-rivlin.h                        | 2 +-
 examples/solids/problems/neo-hookean.c                          | 2 +-
 examples/solids/problems/neo-hookean.h                          | 2 +-
 examples/solids/problems/problems.c                             | 2 +-
 examples/solids/problems/problems.h                             | 2 +-
 examples/solids/qfunctions/common.h                             | 2 +-
 examples/solids/qfunctions/constant-force.h                     | 2 +-
 examples/solids/qfunctions/finite-strain-mooney-rivlin.h        | 2 +-
 examples/solids/qfunctions/finite-strain-neo-hookean.h          | 2 +-
 examples/solids/qfunctions/linear.h                             | 2 +-
 examples/solids/qfunctions/manufactured-force.h                 | 2 +-
 examples/solids/qfunctions/manufactured-true.h                  | 2 +-
 examples/solids/qfunctions/traction-boundary.h                  | 2 +-
 examples/solids/src/boundary.c                                  | 2 +-
 examples/solids/src/cl-options.c                                | 2 +-
 examples/solids/src/matops.c                                    | 2 +-
 examples/solids/src/misc.c                                      | 2 +-
 examples/solids/src/setup-dm.c                                  | 2 +-
 examples/solids/src/setup-libceed.c                             | 2 +-
 gallery/ceed-gallery-list.h                                     | 2 +-
 gallery/ceed-gallery-weak.c                                     | 2 +-
 gallery/identity/ceed-identity.c                                | 2 +-
 gallery/mass-vector/ceed-vectormassapply.c                      | 2 +-
 gallery/mass/ceed-mass1dbuild.c                                 | 2 +-
 gallery/mass/ceed-mass2dbuild.c                                 | 2 +-
 gallery/mass/ceed-mass3dbuild.c                                 | 2 +-
 gallery/mass/ceed-massapply.c                                   | 2 +-
 gallery/poisson-vector/ceed-vectorpoisson1dapply.c              | 2 +-
 gallery/poisson-vector/ceed-vectorpoisson2dapply.c              | 2 +-
 gallery/poisson-vector/ceed-vectorpoisson3dapply.c              | 2 +-
 gallery/poisson/ceed-poisson1dapply.c                           | 2 +-
 gallery/poisson/ceed-poisson1dbuild.c                           | 2 +-
 gallery/poisson/ceed-poisson2dapply.c                           | 2 +-
 gallery/poisson/ceed-poisson2dbuild.c                           | 2 +-
 gallery/poisson/ceed-poisson3dapply.c                           | 2 +-
 gallery/poisson/ceed-poisson3dbuild.c                           | 2 +-
 gallery/scale/ceed-scale.c                                      | 2 +-
 include/ceed-fortran-name.h                                     | 2 +-
 include/ceed-impl.h                                             | 2 +-
 include/ceed/backend.h                                          | 2 +-
 include/ceed/ceed-f32.h                                         | 2 +-
 include/ceed/ceed-f64.h                                         | 2 +-
 include/ceed/ceed.h                                             | 2 +-
 include/ceed/cuda.h                                             | 2 +-
 include/ceed/fortran.h                                          | 2 +-
 include/ceed/hip.h                                              | 2 +-
 include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h         | 2 +-
 include/ceed/jit-source/cuda/cuda-gen-templates.h               | 2 +-
 include/ceed/jit-source/cuda/cuda-jit.h                         | 2 +-
 .../ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h   | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h         | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h  | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h            | 2 +-
 .../ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h  | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h       | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-qfunction.h               | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h   | 2 +-
 .../ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h   | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h      | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h    | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h     | 2 +-
 .../jit-source/cuda/cuda-shared-basis-nontensor-templates.h     | 2 +-
 include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h      | 2 +-
 .../jit-source/cuda/cuda-shared-basis-read-write-templates.h    | 2 +-
 .../cuda/cuda-shared-basis-tensor-at-points-templates.h         | 2 +-
 .../ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h   | 2 +-
 .../ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h   | 2 +-
 include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h         | 2 +-
 include/ceed/jit-source/cuda/cuda-types.h                       | 2 +-
 include/ceed/jit-source/gallery/ceed-identity.h                 | 2 +-
 include/ceed/jit-source/gallery/ceed-mass1dbuild.h              | 2 +-
 include/ceed/jit-source/gallery/ceed-mass2dbuild.h              | 2 +-
 include/ceed/jit-source/gallery/ceed-mass3dbuild.h              | 2 +-
 include/ceed/jit-source/gallery/ceed-massapply.h                | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson1dapply.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson1dbuild.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson2dapply.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson2dbuild.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson3dapply.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson3dbuild.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-scale.h                    | 2 +-
 include/ceed/jit-source/gallery/ceed-vectormassapply.h          | 2 +-
 include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h     | 2 +-
 include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h     | 2 +-
 include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h     | 2 +-
 include/ceed/jit-source/hip/hip-gen-templates.h                 | 2 +-
 include/ceed/jit-source/hip/hip-jit.h                           | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-nontensor.h           | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h    | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-tensor.h              | 2 +-
 .../ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h    | 2 +-
 include/ceed/jit-source/hip/hip-ref-operator-assemble.h         | 2 +-
 include/ceed/jit-source/hip/hip-ref-qfunction.h                 | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-at-points.h     | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-offset.h        | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-oriented.h      | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-strided.h       | 2 +-
 .../ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h  | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-nontensor.h        | 2 +-
 .../ceed/jit-source/hip/hip-shared-basis-read-write-templates.h | 2 +-
 .../hip/hip-shared-basis-tensor-at-points-templates.h           | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-tensor.h           | 2 +-
 include/ceed/jit-source/hip/hip-types.h                         | 2 +-
 include/ceed/jit-source/magma/magma-basis-grad-1d.h             | 2 +-
 include/ceed/jit-source/magma/magma-basis-grad-2d.h             | 2 +-
 include/ceed/jit-source/magma/magma-basis-grad-3d.h             | 2 +-
 include/ceed/jit-source/magma/magma-basis-interp-1d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-interp-2d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-interp-3d.h           | 2 +-
 .../ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h  | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-1d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-2d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-3d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-nontensor.h    | 2 +-
 include/ceed/jit-source/magma/magma-common-defs.h               | 2 +-
 include/ceed/jit-source/magma/magma-common-nontensor.h          | 2 +-
 include/ceed/jit-source/magma/magma-common-tensor.h             | 2 +-
 include/ceed/jit-source/sycl/sycl-gen-templates.h               | 2 +-
 include/ceed/jit-source/sycl/sycl-jit.h                         | 2 +-
 include/ceed/jit-source/sycl/sycl-ref-qfunction.h               | 2 +-
 .../jit-source/sycl/sycl-shared-basis-read-write-templates.h    | 2 +-
 .../ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h   | 2 +-
 include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h         | 2 +-
 include/ceed/jit-source/sycl/sycl-types.h                       | 2 +-
 include/ceed/jit-tools.h                                        | 2 +-
 include/ceed/types.h                                            | 2 +-
 interface/ceed-basis.c                                          | 2 +-
 interface/ceed-cuda.c                                           | 2 +-
 interface/ceed-elemrestriction.c                                | 2 +-
 interface/ceed-fortran.c                                        | 2 +-
 interface/ceed-hip.c                                            | 2 +-
 interface/ceed-jit-source-root-default.c                        | 2 +-
 interface/ceed-jit-source-root-install.c                        | 2 +-
 interface/ceed-jit-tools.c                                      | 2 +-
 interface/ceed-operator.c                                       | 2 +-
 interface/ceed-preconditioning.c                                | 2 +-
 interface/ceed-qfunction-register.c                             | 2 +-
 interface/ceed-qfunction.c                                      | 2 +-
 interface/ceed-qfunctioncontext.c                               | 2 +-
 interface/ceed-register.c                                       | 2 +-
 interface/ceed-tensor.c                                         | 2 +-
 interface/ceed-types.c                                          | 2 +-
 interface/ceed-vector.c                                         | 2 +-
 interface/ceed.c                                                | 2 +-
 python/__init__.py                                              | 2 +-
 python/build_ceed_cffi.py                                       | 2 +-
 python/ceed.py                                                  | 2 +-
 python/ceed_basis.py                                            | 2 +-
 python/ceed_constants.py                                        | 2 +-
 python/ceed_elemrestriction.py                                  | 2 +-
 python/ceed_operator.py                                         | 2 +-
 python/ceed_qfunction.py                                        | 2 +-
 python/ceed_qfunctioncontext.py                                 | 2 +-
 python/ceed_vector.py                                           | 2 +-
 python/tests/Makefile                                           | 2 +-
 python/tests/conftest.py                                        | 2 +-
 python/tests/libceed-qfunctions.c                               | 2 +-
 python/tests/setup-qfunctions.py                                | 2 +-
 python/tests/setup.cfg                                          | 2 +-
 python/tests/test-0-ceed.py                                     | 2 +-
 python/tests/test-1-vector.py                                   | 2 +-
 python/tests/test-2-elemrestriction.py                          | 2 +-
 python/tests/test-3-basis.py                                    | 2 +-
 python/tests/test-4-qfunction.py                                | 2 +-
 python/tests/test-5-operator.py                                 | 2 +-
 python/tests/test-qfunctions.h                                  | 2 +-
 rust/libceed/src/basis.rs                                       | 2 +-
 rust/libceed/src/elem_restriction.rs                            | 2 +-
 rust/libceed/src/lib.rs                                         | 2 +-
 rust/libceed/src/operator.rs                                    | 2 +-
 rust/libceed/src/qfunction.rs                                   | 2 +-
 rust/libceed/src/vector.rs                                      | 2 +-
 rust/libceed/tests/version-numbers.rs                           | 2 +-
 tests/t319-basis.h                                              | 2 +-
 tests/t320-basis-f.h                                            | 2 +-
 tests/t320-basis.h                                              | 2 +-
 tests/t330-basis.h                                              | 2 +-
 tests/t340-basis.h                                              | 2 +-
 tests/t400-qfunction.h                                          | 2 +-
 tests/t401-qfunction.h                                          | 2 +-
 tests/t405-qfunction.h                                          | 2 +-
 tests/t406-qfunction-helper.h                                   | 2 +-
 tests/t406-qfunction-scales.h                                   | 2 +-
 tests/t406-qfunction.h                                          | 2 +-
 tests/t409-qfunction.h                                          | 2 +-
 tests/t500-operator.h                                           | 2 +-
 tests/t502-operator.h                                           | 2 +-
 tests/t507-operator.h                                           | 2 +-
 tests/t510-operator.h                                           | 2 +-
 tests/t522-operator.h                                           | 2 +-
 tests/t530-operator.h                                           | 2 +-
 tests/t531-operator.h                                           | 2 +-
 tests/t532-operator.h                                           | 2 +-
 tests/t534-operator.h                                           | 2 +-
 tests/t535-operator.h                                           | 2 +-
 tests/t537-operator.h                                           | 2 +-
 tests/t539-operator.h                                           | 2 +-
 tests/t540-operator.h                                           | 2 +-
 tests/t541-operator.h                                           | 2 +-
 tests/t566-operator.h                                           | 2 +-
 tests/t567-operator.h                                           | 2 +-
 tests/t568-operator.h                                           | 2 +-
 tests/t580-operator.h                                           | 2 +-
 tests/t590-operator.h                                           | 2 +-
 tests/t591-operator.h                                           | 2 +-
 tests/t595-operator.h                                           | 2 +-
 543 files changed, 543 insertions(+), 543 deletions(-)

diff --git a/LICENSE b/LICENSE
index ec06a37c93..2cefa6edd3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 2-Clause License
 
-Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/Makefile b/Makefile
index f73bc54e3b..c49058bbcb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/README.md b/README.md
index 45f3a08489..41fa318582 100644
--- a/README.md
+++ b/README.md
@@ -478,7 +478,7 @@ The BibTeX entries for these references can be found in the `doc/bib/references.
 
 The following copyright applies to each file in the CEED software suite, unless otherwise stated in the file:
 
-> Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+> Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 > All rights reserved.
 
 See files LICENSE and NOTICE for details.
diff --git a/backends/avx/ceed-avx-blocked.c b/backends/avx/ceed-avx-blocked.c
index bf898f571a..d9098b779e 100644
--- a/backends/avx/ceed-avx-blocked.c
+++ b/backends/avx/ceed-avx-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/avx/ceed-avx-serial.c b/backends/avx/ceed-avx-serial.c
index 5d33900758..e45294c03d 100644
--- a/backends/avx/ceed-avx-serial.c
+++ b/backends/avx/ceed-avx-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/avx/ceed-avx-tensor.c b/backends/avx/ceed-avx-tensor.c
index ce8f26b355..e58b6eee67 100644
--- a/backends/avx/ceed-avx-tensor.c
+++ b/backends/avx/ceed-avx-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/avx/ceed-avx.h b/backends/avx/ceed-avx.h
index 786be45a0d..d9d354b59f 100644
--- a/backends/avx/ceed-avx.h
+++ b/backends/avx/ceed-avx.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 11951e19ba..c698428f50 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/blocked/ceed-blocked.c b/backends/blocked/ceed-blocked.c
index ca55c01e45..7dcf10038c 100644
--- a/backends/blocked/ceed-blocked.c
+++ b/backends/blocked/ceed-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h
index f04307abdc..dacb05b840 100644
--- a/backends/blocked/ceed-blocked.h
+++ b/backends/blocked/ceed-blocked.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
index 75b1d1fe75..2464453c5f 100644
--- a/backends/ceed-backend-list.h
+++ b/backends/ceed-backend-list.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ceed-backend-weak.c b/backends/ceed-backend-weak.c
index e4c401f6a9..c7edeb1861 100644
--- a/backends/ceed-backend-weak.c
+++ b/backends/ceed-backend-weak.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index fe7975c6bf..a972060d57 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
index 88e20ceda2..87b2674b7c 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 0e42eeaaed..2112661dbd 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
index 483b520503..bfb36c16fc 100644
--- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c
+++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index 404b3b5a89..89ef059c1a 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index 09b66171e9..461afd4288 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index b21466f33c..b28cad5ae8 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index bc0414fb2a..52d1797e49 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
index 2c66256963..ec4f40ef86 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
index d8ca4f175b..b0efe60933 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
index 32a02b43dd..9a74f2ad79 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
index 5afbb7aa02..52c9586273 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 3ec1a4ef90..bde12c38cd 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 259d3526ba..59815605d7 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c
index b8bfcd5f79..db87224018 100644
--- a/backends/cuda-ref/ceed-cuda-ref.c
+++ b/backends/cuda-ref/ceed-cuda-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 582b43a975..b8cd4babd7 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 3ce095cb8f..e325629587 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index e99387e027..5b2fa9256e 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c
index fe3c2a7117..cc6b82089d 100644
--- a/backends/cuda-shared/ceed-cuda-shared.c
+++ b/backends/cuda-shared/ceed-cuda-shared.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index 754028b964..ab66e38926 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-common.c b/backends/cuda/ceed-cuda-common.c
index f27b453e4d..35fb6a262e 100644
--- a/backends/cuda/ceed-cuda-common.c
+++ b/backends/cuda/ceed-cuda-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-common.h b/backends/cuda/ceed-cuda-common.h
index 1fc8362717..0236e57cdc 100644
--- a/backends/cuda/ceed-cuda-common.h
+++ b/backends/cuda/ceed-cuda-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index c0c2be9035..215c9c017d 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h
index 07572aa4ad..8bda2c2f1a 100644
--- a/backends/cuda/ceed-cuda-compile.h
+++ b/backends/cuda/ceed-cuda-compile.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index de77fc3121..10788a7b7b 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h
index 4d5de74269..3193505f5c 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.h
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 60f28accd9..e09b0c5b1b 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c
index 6da2c1d10e..33bfec8f46 100644
--- a/backends/hip-gen/ceed-hip-gen-qfunction.c
+++ b/backends/hip-gen/ceed-hip-gen-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index 4ba43f8918..ab0a566630 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index 760fef2ed5..2af90f2642 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index be4fdd459b..69091bc381 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index e20d6d13af..3124a48eb9 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
index 3cda2455e7..fe8a96cf38 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.h b/backends/hip-ref/ceed-hip-ref-qfunction-load.h
index dc83256d83..806874fdf1 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.h
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c
index 92835b897e..95e2b90351 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
index 0d09a2087d..1f5eab0ea4 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 0bdf65add6..543839a2bf 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 93798d72e8..238f4b5625 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c
index fa3215e027..b15686b8dc 100644
--- a/backends/hip-ref/ceed-hip-ref.c
+++ b/backends/hip-ref/ceed-hip-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 9740700d87..9209e88049 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index 1186548b16..a45118a6df 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 37859b1f21..410d13af2e 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c
index 72d1da5f0b..d8493e63ec 100644
--- a/backends/hip-shared/ceed-hip-shared.c
+++ b/backends/hip-shared/ceed-hip-shared.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index cfb9480f49..ea92ca1ee3 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c
index b73ef1ffb8..837cbdc869 100644
--- a/backends/hip/ceed-hip-common.c
+++ b/backends/hip/ceed-hip-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-common.h b/backends/hip/ceed-hip-common.h
index c62c392abe..28805b944b 100644
--- a/backends/hip/ceed-hip-common.h
+++ b/backends/hip/ceed-hip-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index bad8519db6..557f6e584e 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h
index 66f542e1a1..d2cb987819 100644
--- a/backends/hip/ceed-hip-compile.h
+++ b/backends/hip/ceed-hip-compile.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 3043576489..9a5fe0fc1e 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-common.c b/backends/magma/ceed-magma-common.c
index 592f216c6f..b3a03491c5 100644
--- a/backends/magma/ceed-magma-common.c
+++ b/backends/magma/ceed-magma-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-common.h b/backends/magma/ceed-magma-common.h
index 5ebf9b0d10..1604aaef12 100644
--- a/backends/magma/ceed-magma-common.h
+++ b/backends/magma/ceed-magma-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-det.c b/backends/magma/ceed-magma-det.c
index 5ecbe462e1..07b5bdd291 100644
--- a/backends/magma/ceed-magma-det.c
+++ b/backends/magma/ceed-magma-det.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-nontensor.cpp b/backends/magma/ceed-magma-gemm-nontensor.cpp
index 856b514acb..6ac67ae227 100644
--- a/backends/magma/ceed-magma-gemm-nontensor.cpp
+++ b/backends/magma/ceed-magma-gemm-nontensor.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-nontensor.h b/backends/magma/ceed-magma-gemm-nontensor.h
index 0431620b83..e867ad2600 100644
--- a/backends/magma/ceed-magma-gemm-nontensor.h
+++ b/backends/magma/ceed-magma-gemm-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-selector.cpp b/backends/magma/ceed-magma-gemm-selector.cpp
index 46f963bca0..22532a8049 100644
--- a/backends/magma/ceed-magma-gemm-selector.cpp
+++ b/backends/magma/ceed-magma-gemm-selector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-selector.h b/backends/magma/ceed-magma-gemm-selector.h
index c96c95c169..c94eb81c20 100644
--- a/backends/magma/ceed-magma-gemm-selector.h
+++ b/backends/magma/ceed-magma-gemm-selector.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c
index f325947920..8ffce54fe6 100644
--- a/backends/magma/ceed-magma.c
+++ b/backends/magma/ceed-magma.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index 22dd4264b6..12eb9cab28 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/Makefile b/backends/magma/tuning/Makefile
index 37cfa194f7..930213e647 100644
--- a/backends/magma/tuning/Makefile
+++ b/backends/magma/tuning/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/generate_tuning.py b/backends/magma/tuning/generate_tuning.py
index 10a2062881..04d563a99b 100644
--- a/backends/magma/tuning/generate_tuning.py
+++ b/backends/magma/tuning/generate_tuning.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/tuning.cpp b/backends/magma/tuning/tuning.cpp
index 7a387c14b6..44e313fe69 100644
--- a/backends/magma/tuning/tuning.cpp
+++ b/backends/magma/tuning/tuning.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-blocked.c b/backends/memcheck/ceed-memcheck-blocked.c
index 5bc0a5dc10..6d9ca0e0c9 100644
--- a/backends/memcheck/ceed-memcheck-blocked.c
+++ b/backends/memcheck/ceed-memcheck-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c
index b73e3c5e90..d67ea92a8f 100644
--- a/backends/memcheck/ceed-memcheck-qfunction.c
+++ b/backends/memcheck/ceed-memcheck-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-qfunctioncontext.c b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
index 6149a5a3ac..0ba11cc464 100644
--- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c
+++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index 57faf28116..47200bf004 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-serial.c b/backends/memcheck/ceed-memcheck-serial.c
index 433380d6d4..6a5b5a7b5e 100644
--- a/backends/memcheck/ceed-memcheck-serial.c
+++ b/backends/memcheck/ceed-memcheck-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index b2d6b5efd5..01187d06e9 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h
index 45eb9c5ae3..c09b533549 100644
--- a/backends/memcheck/ceed-memcheck.h
+++ b/backends/memcheck/ceed-memcheck.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-basis.cpp b/backends/occa/ceed-occa-basis.cpp
index c9f94d18d0..29f9c39e41 100644
--- a/backends/occa/ceed-occa-basis.cpp
+++ b/backends/occa/ceed-occa-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-basis.hpp b/backends/occa/ceed-occa-basis.hpp
index 2fe01ec052..54d4bddb55 100644
--- a/backends/occa/ceed-occa-basis.hpp
+++ b/backends/occa/ceed-occa-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-ceed-object.cpp b/backends/occa/ceed-occa-ceed-object.cpp
index c6dd28fef2..199d6e0119 100644
--- a/backends/occa/ceed-occa-ceed-object.cpp
+++ b/backends/occa/ceed-occa-ceed-object.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-ceed-object.hpp b/backends/occa/ceed-occa-ceed-object.hpp
index 46235cbad5..326c5c2944 100644
--- a/backends/occa/ceed-occa-ceed-object.hpp
+++ b/backends/occa/ceed-occa-ceed-object.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-context.cpp b/backends/occa/ceed-occa-context.cpp
index 4a705147de..2119a0eede 100644
--- a/backends/occa/ceed-occa-context.cpp
+++ b/backends/occa/ceed-occa-context.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-context.hpp b/backends/occa/ceed-occa-context.hpp
index 3e1586082b..1785cc2024 100644
--- a/backends/occa/ceed-occa-context.hpp
+++ b/backends/occa/ceed-occa-context.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-cpu-operator.cpp b/backends/occa/ceed-occa-cpu-operator.cpp
index cf5bd3fe59..7f725c7d60 100644
--- a/backends/occa/ceed-occa-cpu-operator.cpp
+++ b/backends/occa/ceed-occa-cpu-operator.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-cpu-operator.hpp b/backends/occa/ceed-occa-cpu-operator.hpp
index e7e79b059c..65496e274b 100644
--- a/backends/occa/ceed-occa-cpu-operator.hpp
+++ b/backends/occa/ceed-occa-cpu-operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-elem-restriction.cpp b/backends/occa/ceed-occa-elem-restriction.cpp
index 140041cb1d..026f45d90c 100644
--- a/backends/occa/ceed-occa-elem-restriction.cpp
+++ b/backends/occa/ceed-occa-elem-restriction.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-elem-restriction.hpp b/backends/occa/ceed-occa-elem-restriction.hpp
index 7ac03146b8..3fdc226df7 100644
--- a/backends/occa/ceed-occa-elem-restriction.hpp
+++ b/backends/occa/ceed-occa-elem-restriction.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-gpu-operator.cpp b/backends/occa/ceed-occa-gpu-operator.cpp
index af7a43becd..14984056fc 100644
--- a/backends/occa/ceed-occa-gpu-operator.cpp
+++ b/backends/occa/ceed-occa-gpu-operator.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-gpu-operator.hpp b/backends/occa/ceed-occa-gpu-operator.hpp
index fc14304975..8ebf6e742d 100644
--- a/backends/occa/ceed-occa-gpu-operator.hpp
+++ b/backends/occa/ceed-occa-gpu-operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-kernels.hpp b/backends/occa/ceed-occa-kernels.hpp
index 86469be1f1..d5a4896e7b 100644
--- a/backends/occa/ceed-occa-kernels.hpp
+++ b/backends/occa/ceed-occa-kernels.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-args.cpp b/backends/occa/ceed-occa-operator-args.cpp
index 61199ce288..0008c79ada 100644
--- a/backends/occa/ceed-occa-operator-args.cpp
+++ b/backends/occa/ceed-occa-operator-args.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-args.hpp b/backends/occa/ceed-occa-operator-args.hpp
index 5edf95188c..683225da80 100644
--- a/backends/occa/ceed-occa-operator-args.hpp
+++ b/backends/occa/ceed-occa-operator-args.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-field.cpp b/backends/occa/ceed-occa-operator-field.cpp
index 4745b8dfc0..2906c53b0a 100644
--- a/backends/occa/ceed-occa-operator-field.cpp
+++ b/backends/occa/ceed-occa-operator-field.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-field.hpp b/backends/occa/ceed-occa-operator-field.hpp
index 4eeb5e70ed..8849fffeb9 100644
--- a/backends/occa/ceed-occa-operator-field.hpp
+++ b/backends/occa/ceed-occa-operator-field.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator.cpp b/backends/occa/ceed-occa-operator.cpp
index c19e875033..057a131162 100644
--- a/backends/occa/ceed-occa-operator.cpp
+++ b/backends/occa/ceed-occa-operator.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator.hpp b/backends/occa/ceed-occa-operator.hpp
index 5325bdf33d..d2a84ddb6b 100644
--- a/backends/occa/ceed-occa-operator.hpp
+++ b/backends/occa/ceed-occa-operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-args.cpp b/backends/occa/ceed-occa-qfunction-args.cpp
index b8d2d9e936..974719b0a6 100644
--- a/backends/occa/ceed-occa-qfunction-args.cpp
+++ b/backends/occa/ceed-occa-qfunction-args.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-args.hpp b/backends/occa/ceed-occa-qfunction-args.hpp
index 77093ec93d..a8c5638a98 100644
--- a/backends/occa/ceed-occa-qfunction-args.hpp
+++ b/backends/occa/ceed-occa-qfunction-args.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-field.cpp b/backends/occa/ceed-occa-qfunction-field.cpp
index 7dada84ba8..1c15578544 100644
--- a/backends/occa/ceed-occa-qfunction-field.cpp
+++ b/backends/occa/ceed-occa-qfunction-field.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-field.hpp b/backends/occa/ceed-occa-qfunction-field.hpp
index 86eefd690e..7f4b34e158 100644
--- a/backends/occa/ceed-occa-qfunction-field.hpp
+++ b/backends/occa/ceed-occa-qfunction-field.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction.cpp b/backends/occa/ceed-occa-qfunction.cpp
index ac8e3b7386..a72b04d00c 100644
--- a/backends/occa/ceed-occa-qfunction.cpp
+++ b/backends/occa/ceed-occa-qfunction.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction.hpp b/backends/occa/ceed-occa-qfunction.hpp
index 4af04c5bd7..e0eb84de84 100644
--- a/backends/occa/ceed-occa-qfunction.hpp
+++ b/backends/occa/ceed-occa-qfunction.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunctioncontext.cpp b/backends/occa/ceed-occa-qfunctioncontext.cpp
index 017925f5a1..6cdc9f36de 100644
--- a/backends/occa/ceed-occa-qfunctioncontext.cpp
+++ b/backends/occa/ceed-occa-qfunctioncontext.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunctioncontext.hpp b/backends/occa/ceed-occa-qfunctioncontext.hpp
index 850eb3adbf..4cfde9a25c 100644
--- a/backends/occa/ceed-occa-qfunctioncontext.hpp
+++ b/backends/occa/ceed-occa-qfunctioncontext.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-simplex-basis.cpp b/backends/occa/ceed-occa-simplex-basis.cpp
index 747d21afd9..5fff1d4e7e 100644
--- a/backends/occa/ceed-occa-simplex-basis.cpp
+++ b/backends/occa/ceed-occa-simplex-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-simplex-basis.hpp b/backends/occa/ceed-occa-simplex-basis.hpp
index c27b6d0a88..e4875f2ddb 100644
--- a/backends/occa/ceed-occa-simplex-basis.hpp
+++ b/backends/occa/ceed-occa-simplex-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-tensor-basis.cpp b/backends/occa/ceed-occa-tensor-basis.cpp
index 553672170c..29c9361f09 100644
--- a/backends/occa/ceed-occa-tensor-basis.cpp
+++ b/backends/occa/ceed-occa-tensor-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-tensor-basis.hpp b/backends/occa/ceed-occa-tensor-basis.hpp
index 35e345b8c9..4d0dc2c2ba 100644
--- a/backends/occa/ceed-occa-tensor-basis.hpp
+++ b/backends/occa/ceed-occa-tensor-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-types.hpp b/backends/occa/ceed-occa-types.hpp
index cc56791f85..9dc1d83f58 100644
--- a/backends/occa/ceed-occa-types.hpp
+++ b/backends/occa/ceed-occa-types.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-vector.cpp b/backends/occa/ceed-occa-vector.cpp
index 0a5c51a28a..9b369a4efe 100644
--- a/backends/occa/ceed-occa-vector.cpp
+++ b/backends/occa/ceed-occa-vector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-vector.hpp b/backends/occa/ceed-occa-vector.hpp
index 37abf5d7fa..71fcbdf693 100644
--- a/backends/occa/ceed-occa-vector.hpp
+++ b/backends/occa/ceed-occa-vector.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa.cpp b/backends/occa/ceed-occa.cpp
index d43231f2a1..eca2f4e798 100644
--- a/backends/occa/ceed-occa.cpp
+++ b/backends/occa/ceed-occa.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa.h b/backends/occa/ceed-occa.h
index d9dc78ebd4..43df0a7001 100644
--- a/backends/occa/ceed-occa.h
+++ b/backends/occa/ceed-occa.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/elem-restriction.cpp b/backends/occa/kernels/elem-restriction.cpp
index 947556be1f..824e1ef79a 100644
--- a/backends/occa/kernels/elem-restriction.cpp
+++ b/backends/occa/kernels/elem-restriction.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/elem-restriction.hpp b/backends/occa/kernels/elem-restriction.hpp
index ac45de6c49..65d1309e03 100644
--- a/backends/occa/kernels/elem-restriction.hpp
+++ b/backends/occa/kernels/elem-restriction.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/kernel-defines.hpp b/backends/occa/kernels/kernel-defines.hpp
index beb0c79624..bbda69714b 100644
--- a/backends/occa/kernels/kernel-defines.hpp
+++ b/backends/occa/kernels/kernel-defines.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/set-value.cpp b/backends/occa/kernels/set-value.cpp
index a7a756e442..b165748d22 100644
--- a/backends/occa/kernels/set-value.cpp
+++ b/backends/occa/kernels/set-value.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/set-value.hpp b/backends/occa/kernels/set-value.hpp
index fa5303f5f0..240b531d6c 100644
--- a/backends/occa/kernels/set-value.hpp
+++ b/backends/occa/kernels/set-value.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/simplex-basis.hpp b/backends/occa/kernels/simplex-basis.hpp
index 4f53e5c6dd..40072e5758 100644
--- a/backends/occa/kernels/simplex-basis.hpp
+++ b/backends/occa/kernels/simplex-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
index 39a36684c2..17976fa6d1 100644
--- a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
+++ b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
index aa09fa60d7..2ef0dfe28a 100644
--- a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
+++ b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis.hpp b/backends/occa/kernels/tensor-basis.hpp
index 54850a7830..fe763ace59 100644
--- a/backends/occa/kernels/tensor-basis.hpp
+++ b/backends/occa/kernels/tensor-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
index 90c9e905d5..98619f64e8 100644
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
+++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
index d150129584..8649e42070 100644
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
+++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
index 942470b85f..eda0e8475d 100644
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
+++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
index 34377f29b9..892447c271 100644
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
+++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
index 4d99490306..c98df69a7f 100644
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
+++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
index 3b59827a8e..3251951305 100644
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
+++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-blocked.c b/backends/opt/ceed-opt-blocked.c
index fcb8140ef1..a3ba49c7fa 100644
--- a/backends/opt/ceed-opt-blocked.c
+++ b/backends/opt/ceed-opt-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 679e88163b..0695f7f1df 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-serial.c b/backends/opt/ceed-opt-serial.c
index 66fc1a9cfb..86bc832bc8 100644
--- a/backends/opt/ceed-opt-serial.c
+++ b/backends/opt/ceed-opt-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-tensor.c b/backends/opt/ceed-opt-tensor.c
index a8f5335e35..24a00adb81 100644
--- a/backends/opt/ceed-opt-tensor.c
+++ b/backends/opt/ceed-opt-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h
index d5f7399a89..96bfff26bc 100644
--- a/backends/opt/ceed-opt.h
+++ b/backends/opt/ceed-opt.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index 550f631159..2ae551eaf2 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index b53a8c6150..151316f83d 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-qfunction.c b/backends/ref/ceed-ref-qfunction.c
index efddda2dcc..a0e6e32cce 100644
--- a/backends/ref/ceed-ref-qfunction.c
+++ b/backends/ref/ceed-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-qfunctioncontext.c b/backends/ref/ceed-ref-qfunctioncontext.c
index 0d3c8bba36..1e8ed0cc90 100644
--- a/backends/ref/ceed-ref-qfunctioncontext.c
+++ b/backends/ref/ceed-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 65b6cf080f..3b5a5d50e4 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-tensor.c b/backends/ref/ceed-ref-tensor.c
index 17499172a6..38c7880dc6 100644
--- a/backends/ref/ceed-ref-tensor.c
+++ b/backends/ref/ceed-ref-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-vector.c b/backends/ref/ceed-ref-vector.c
index 2af3a8770c..813ce21d6d 100644
--- a/backends/ref/ceed-ref-vector.c
+++ b/backends/ref/ceed-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref.c b/backends/ref/ceed-ref.c
index a3c15faf8f..274e3c72c9 100644
--- a/backends/ref/ceed-ref.c
+++ b/backends/ref/ceed-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index 880b4f89af..621424b2ed 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
index ca469f8d77..0a00fdb3b9 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index b3850c05fc..aa7d5f0253 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index 1acd7147ee..c1414c15c0 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
index e810bfbf7a..3529e76015 100644
--- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp
index bc1179e4f2..cc632651f6 100644
--- a/backends/sycl-gen/ceed-sycl-gen.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index 2a1ed4ad49..6335df787a 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
index 1330d61a6a..a90ad9e78c 100644
--- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 89148678d3..1bf91636db 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
index 23e792f90e..712a8b66b4 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
index 606a1f45ad..e2e2b63749 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
index 6a2c7f060b..0c783e713b 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
index 7130a0dead..0629d404be 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp
index 46123d5569..adce713bb6 100644
--- a/backends/sycl-ref/ceed-sycl-ref.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
index 6229003cb4..6b2996a460 100644
--- a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
index 07a451213b..ba7e520dfc 100644
--- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
index 32dda419f0..1c7d3f8b5e 100644
--- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/kernels/sycl-ref-vector.cpp b/backends/sycl-ref/kernels/sycl-ref-vector.cpp
index 788b608f3a..bc1c26593c 100644
--- a/backends/sycl-ref/kernels/sycl-ref-vector.cpp
+++ b/backends/sycl-ref/kernels/sycl-ref-vector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
index 7d8302599f..bac5693aa4 100644
--- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-shared/ceed-sycl-shared.hpp b/backends/sycl-shared/ceed-sycl-shared.hpp
index e4a4c9f203..69dd86fe9b 100644
--- a/backends/sycl-shared/ceed-sycl-shared.hpp
+++ b/backends/sycl-shared/ceed-sycl-shared.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
index d7018c149c..7f001a65cd 100644
--- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-common.hpp b/backends/sycl/ceed-sycl-common.hpp
index e61cbebc18..3a84e1ef33 100644
--- a/backends/sycl/ceed-sycl-common.hpp
+++ b/backends/sycl/ceed-sycl-common.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp
index 176b39cd84..d75e3ec7db 100644
--- a/backends/sycl/ceed-sycl-common.sycl.cpp
+++ b/backends/sycl/ceed-sycl-common.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp
index 67db04f294..9faea1f6dc 100644
--- a/backends/sycl/ceed-sycl-compile.hpp
+++ b/backends/sycl/ceed-sycl-compile.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp
index 9dc0177401..02caf67dbc 100644
--- a/backends/sycl/ceed-sycl-compile.sycl.cpp
+++ b/backends/sycl/ceed-sycl-compile.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm-blocked.c b/backends/xsmm/ceed-xsmm-blocked.c
index 1bd5d724f1..ec6fb2376f 100644
--- a/backends/xsmm/ceed-xsmm-blocked.c
+++ b/backends/xsmm/ceed-xsmm-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm-serial.c b/backends/xsmm/ceed-xsmm-serial.c
index 69d51b769f..0869ff3442 100644
--- a/backends/xsmm/ceed-xsmm-serial.c
+++ b/backends/xsmm/ceed-xsmm-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c
index 899726ef09..8386181de4 100644
--- a/backends/xsmm/ceed-xsmm-tensor.c
+++ b/backends/xsmm/ceed-xsmm-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm.h b/backends/xsmm/ceed-xsmm.h
index 0cb56591fe..65ff339d8a 100644
--- a/backends/xsmm/ceed-xsmm.h
+++ b/backends/xsmm/ceed-xsmm.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/benchmark.sh b/benchmarks/benchmark.sh
index 167e374f7b..9cf27bd199 100755
--- a/benchmarks/benchmark.sh
+++ b/benchmarks/benchmark.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/petsc-bps.sh b/benchmarks/petsc-bps.sh
index 46ba51b73c..8e69a10c86 100755
--- a/benchmarks/petsc-bps.sh
+++ b/benchmarks/petsc-bps.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/petsc-bpsraw.sh b/benchmarks/petsc-bpsraw.sh
index 666593c7d3..20567e4408 100755
--- a/benchmarks/petsc-bpsraw.sh
+++ b/benchmarks/petsc-bpsraw.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_base.py b/benchmarks/postprocess_base.py
index b9a8d46ddf..b63acb4b65 100755
--- a/benchmarks/postprocess_base.py
+++ b/benchmarks/postprocess_base.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_plot.py b/benchmarks/postprocess_plot.py
index 62939e54d7..52b7c5ee5b 100755
--- a/benchmarks/postprocess_plot.py
+++ b/benchmarks/postprocess_plot.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_table.py b/benchmarks/postprocess_table.py
index 27a200e0b1..22463e4cf7 100755
--- a/benchmarks/postprocess_table.py
+++ b/benchmarks/postprocess_table.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/common.mk b/common.mk
index 4c466b8782..19ba90d847 100644
--- a/common.mk
+++ b/common.mk
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/Makefile b/examples/ceed/Makefile
index 57528cc1cd..419499f05e 100644
--- a/examples/ceed/Makefile
+++ b/examples/ceed/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c
index 554c5ab883..1ac84ea112 100644
--- a/examples/ceed/ex1-volume.c
+++ b/examples/ceed/ex1-volume.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h
index 98d1315bf3..79cf3d0d4e 100644
--- a/examples/ceed/ex1-volume.h
+++ b/examples/ceed/ex1-volume.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c
index 269312698e..2db90319fd 100644
--- a/examples/ceed/ex2-surface.c
+++ b/examples/ceed/ex2-surface.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h
index d11ee6ab5f..901be2e9ce 100644
--- a/examples/ceed/ex2-surface.h
+++ b/examples/ceed/ex2-surface.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex3-volume.c b/examples/ceed/ex3-volume.c
index 1bee79686e..00293f3b46 100644
--- a/examples/ceed/ex3-volume.c
+++ b/examples/ceed/ex3-volume.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex3-volume.h b/examples/ceed/ex3-volume.h
index d544d229c1..11e9c7f99c 100644
--- a/examples/ceed/ex3-volume.h
+++ b/examples/ceed/ex3-volume.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/bc_definition.h b/examples/fluids/include/bc_definition.h
index 53fff9f23f..095c3884b1 100644
--- a/examples/fluids/include/bc_definition.h
+++ b/examples/fluids/include/bc_definition.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/log_events.h b/examples/fluids/include/log_events.h
index 418897df94..8e65ac373c 100644
--- a/examples/fluids/include/log_events.h
+++ b/examples/fluids/include/log_events.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h
index d035fd230d..882b6221bc 100644
--- a/examples/fluids/include/mat-ceed-impl.h
+++ b/examples/fluids/include/mat-ceed-impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h
index d392c6532f..11e7caad0d 100644
--- a/examples/fluids/include/mat-ceed.h
+++ b/examples/fluids/include/mat-ceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/petsc-ceed-utils.h b/examples/fluids/include/petsc-ceed-utils.h
index d085ad670f..33c9aa2412 100644
--- a/examples/fluids/include/petsc-ceed-utils.h
+++ b/examples/fluids/include/petsc-ceed-utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/petsc-ceed.h b/examples/fluids/include/petsc-ceed.h
index 6e77c39e59..a8667b3b75 100644
--- a/examples/fluids/include/petsc-ceed.h
+++ b/examples/fluids/include/petsc-ceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/petsc_ops.h b/examples/fluids/include/petsc_ops.h
index 9913780172..ab79c0ad76 100644
--- a/examples/fluids/include/petsc_ops.h
+++ b/examples/fluids/include/petsc_ops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 7468ef5a05..122f1f579d 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 074ae865b5..4f93e64a71 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index d1ce21755b..09d444dd01 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c
index 2907a241da..45458af00f 100644
--- a/examples/fluids/problems/bc_freestream.c
+++ b/examples/fluids/problems/bc_freestream.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/bc_slip.c b/examples/fluids/problems/bc_slip.c
index 05c686d4ff..2bb762e8d1 100644
--- a/examples/fluids/problems/bc_slip.c
+++ b/examples/fluids/problems/bc_slip.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index 5481e90896..aec0d1fc82 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/channel.c b/examples/fluids/problems/channel.c
index 8c0511114b..4b95e9acab 100644
--- a/examples/fluids/problems/channel.c
+++ b/examples/fluids/problems/channel.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/densitycurrent.c b/examples/fluids/problems/densitycurrent.c
index e49da42b5b..7075701ecb 100644
--- a/examples/fluids/problems/densitycurrent.c
+++ b/examples/fluids/problems/densitycurrent.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c
index c0b4bff51f..96b2f5d293 100644
--- a/examples/fluids/problems/eulervortex.c
+++ b/examples/fluids/problems/eulervortex.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/gaussianwave.c b/examples/fluids/problems/gaussianwave.c
index 2ee5e41726..b421296473 100644
--- a/examples/fluids/problems/gaussianwave.c
+++ b/examples/fluids/problems/gaussianwave.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index f6033cd392..f02cc949e8 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/shocktube.c b/examples/fluids/problems/shocktube.c
index 3e637392fa..32c8dd691a 100644
--- a/examples/fluids/problems/shocktube.c
+++ b/examples/fluids/problems/shocktube.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c
index 58be441f81..ca5eb8c9f1 100644
--- a/examples/fluids/problems/stg_shur14.c
+++ b/examples/fluids/problems/stg_shur14.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/stg_shur14.h b/examples/fluids/problems/stg_shur14.h
index ea2087af28..0af335b26d 100644
--- a/examples/fluids/problems/stg_shur14.h
+++ b/examples/fluids/problems/stg_shur14.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/taylorgreen.c b/examples/fluids/problems/taylorgreen.c
index 9c090b593f..a6974775fe 100644
--- a/examples/fluids/problems/taylorgreen.c
+++ b/examples/fluids/problems/taylorgreen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/advection.h b/examples/fluids/qfunctions/advection.h
index 41f6b249e7..e7cace4afd 100644
--- a/examples/fluids/qfunctions/advection.h
+++ b/examples/fluids/qfunctions/advection.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/advection_types.h b/examples/fluids/qfunctions/advection_types.h
index daaee10bf7..90c709d54a 100644
--- a/examples/fluids/qfunctions/advection_types.h
+++ b/examples/fluids/qfunctions/advection_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/bc_freestream.h b/examples/fluids/qfunctions/bc_freestream.h
index b6c0aa33cf..7767b3af1f 100644
--- a/examples/fluids/qfunctions/bc_freestream.h
+++ b/examples/fluids/qfunctions/bc_freestream.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/bc_freestream_type.h b/examples/fluids/qfunctions/bc_freestream_type.h
index 8c30ca2915..a6c7456842 100644
--- a/examples/fluids/qfunctions/bc_freestream_type.h
+++ b/examples/fluids/qfunctions/bc_freestream_type.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/bc_slip.h b/examples/fluids/qfunctions/bc_slip.h
index 3a7c5b5bc2..2d5bd21cac 100644
--- a/examples/fluids/qfunctions/bc_slip.h
+++ b/examples/fluids/qfunctions/bc_slip.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index e372aeedfb..88c24661de 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/channel.h b/examples/fluids/qfunctions/channel.h
index 9d458b0f31..d19fafec03 100644
--- a/examples/fluids/qfunctions/channel.h
+++ b/examples/fluids/qfunctions/channel.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/densitycurrent.h b/examples/fluids/qfunctions/densitycurrent.h
index d1e61a0a10..b328cce698 100644
--- a/examples/fluids/qfunctions/densitycurrent.h
+++ b/examples/fluids/qfunctions/densitycurrent.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/differential_filter.h b/examples/fluids/qfunctions/differential_filter.h
index 36b4cfa2a5..e8dc47e619 100644
--- a/examples/fluids/qfunctions/differential_filter.h
+++ b/examples/fluids/qfunctions/differential_filter.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/differential_filter_enums.h b/examples/fluids/qfunctions/differential_filter_enums.h
index ffa548fff6..9e00c67ccf 100644
--- a/examples/fluids/qfunctions/differential_filter_enums.h
+++ b/examples/fluids/qfunctions/differential_filter_enums.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/eulervortex.h b/examples/fluids/qfunctions/eulervortex.h
index 878c5f615c..8862005cc2 100644
--- a/examples/fluids/qfunctions/eulervortex.h
+++ b/examples/fluids/qfunctions/eulervortex.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/gaussianwave.h b/examples/fluids/qfunctions/gaussianwave.h
index f48de3bcf2..aa90258248 100644
--- a/examples/fluids/qfunctions/gaussianwave.h
+++ b/examples/fluids/qfunctions/gaussianwave.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/grid_anisotropy_tensor.h b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
index cea712726f..7311c3090d 100644
--- a/examples/fluids/qfunctions/grid_anisotropy_tensor.h
+++ b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/inverse_multiplicity.h b/examples/fluids/qfunctions/inverse_multiplicity.h
index 6f83c7b39c..07191c5fc2 100644
--- a/examples/fluids/qfunctions/inverse_multiplicity.h
+++ b/examples/fluids/qfunctions/inverse_multiplicity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/mass.h b/examples/fluids/qfunctions/mass.h
index 42d27b2f68..a05857c53e 100644
--- a/examples/fluids/qfunctions/mass.h
+++ b/examples/fluids/qfunctions/mass.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h
index a5ca161b3b..a273a25aea 100644
--- a/examples/fluids/qfunctions/newtonian.h
+++ b/examples/fluids/qfunctions/newtonian.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h
index ab49f0d2c2..b0b054831d 100644
--- a/examples/fluids/qfunctions/newtonian_state.h
+++ b/examples/fluids/qfunctions/newtonian_state.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/newtonian_types.h b/examples/fluids/qfunctions/newtonian_types.h
index b7c4e7e36e..15cf679a2e 100644
--- a/examples/fluids/qfunctions/newtonian_types.h
+++ b/examples/fluids/qfunctions/newtonian_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/riemann_solver.h b/examples/fluids/qfunctions/riemann_solver.h
index b3d36f86ba..a3a2970da1 100644
--- a/examples/fluids/qfunctions/riemann_solver.h
+++ b/examples/fluids/qfunctions/riemann_solver.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/setupgeo.h b/examples/fluids/qfunctions/setupgeo.h
index 4e8e9cf8f4..b05bbd8373 100644
--- a/examples/fluids/qfunctions/setupgeo.h
+++ b/examples/fluids/qfunctions/setupgeo.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/setupgeo2d.h b/examples/fluids/qfunctions/setupgeo2d.h
index 0c2662906c..4564a3fc27 100644
--- a/examples/fluids/qfunctions/setupgeo2d.h
+++ b/examples/fluids/qfunctions/setupgeo2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/setupgeo_helpers.h b/examples/fluids/qfunctions/setupgeo_helpers.h
index b52c3cdcff..870295fd74 100644
--- a/examples/fluids/qfunctions/setupgeo_helpers.h
+++ b/examples/fluids/qfunctions/setupgeo_helpers.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/shocktube.h b/examples/fluids/qfunctions/shocktube.h
index 3ff908f4af..c4a5ead1c1 100644
--- a/examples/fluids/qfunctions/shocktube.h
+++ b/examples/fluids/qfunctions/shocktube.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stabilization.h b/examples/fluids/qfunctions/stabilization.h
index 655d49e7a8..9c4718498b 100644
--- a/examples/fluids/qfunctions/stabilization.h
+++ b/examples/fluids/qfunctions/stabilization.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stabilization_types.h b/examples/fluids/qfunctions/stabilization_types.h
index 97492dd73a..c9241ecc4e 100644
--- a/examples/fluids/qfunctions/stabilization_types.h
+++ b/examples/fluids/qfunctions/stabilization_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index 28a779aa8c..1394ac53e4 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stg_shur14_type.h b/examples/fluids/qfunctions/stg_shur14_type.h
index f7c8942614..4ade5d20c2 100644
--- a/examples/fluids/qfunctions/stg_shur14_type.h
+++ b/examples/fluids/qfunctions/stg_shur14_type.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/strong_boundary_conditions.h b/examples/fluids/qfunctions/strong_boundary_conditions.h
index 7bb0453796..8177fe33d3 100644
--- a/examples/fluids/qfunctions/strong_boundary_conditions.h
+++ b/examples/fluids/qfunctions/strong_boundary_conditions.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/taylorgreen.h b/examples/fluids/qfunctions/taylorgreen.h
index 3b42fe18d8..7bc6074990 100644
--- a/examples/fluids/qfunctions/taylorgreen.h
+++ b/examples/fluids/qfunctions/taylorgreen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/turb_spanstats.h b/examples/fluids/qfunctions/turb_spanstats.h
index 377a0bbf75..49adf1f364 100644
--- a/examples/fluids/qfunctions/turb_spanstats.h
+++ b/examples/fluids/qfunctions/turb_spanstats.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/turb_stats_types.h b/examples/fluids/qfunctions/turb_stats_types.h
index 95136f9ff0..12deac8c20 100644
--- a/examples/fluids/qfunctions/turb_stats_types.h
+++ b/examples/fluids/qfunctions/turb_stats_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/utils.h b/examples/fluids/qfunctions/utils.h
index 90f67fad24..61dcc13d3a 100644
--- a/examples/fluids/qfunctions/utils.h
+++ b/examples/fluids/qfunctions/utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
index 1c0390d3b9..d4c1ffc482 100644
--- a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
+++ b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/velocity_gradient_projection.h b/examples/fluids/qfunctions/velocity_gradient_projection.h
index 28914c13d9..3b143d6c33 100644
--- a/examples/fluids/qfunctions/velocity_gradient_projection.h
+++ b/examples/fluids/qfunctions/velocity_gradient_projection.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/bc_definition.c b/examples/fluids/src/bc_definition.c
index 5d9b467057..03bb6e3569 100644
--- a/examples/fluids/src/bc_definition.c
+++ b/examples/fluids/src/bc_definition.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/boundary_condition.c b/examples/fluids/src/boundary_condition.c
index 18882f9839..3f3f7f7a06 100644
--- a/examples/fluids/src/boundary_condition.c
+++ b/examples/fluids/src/boundary_condition.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c
index 6dd1d36b6b..aa9d70ccb9 100644
--- a/examples/fluids/src/cloptions.c
+++ b/examples/fluids/src/cloptions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index e6c0db2120..8d0b1e6949 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/dm_utils.c b/examples/fluids/src/dm_utils.c
index 074240fbfc..4e91ba6da2 100644
--- a/examples/fluids/src/dm_utils.c
+++ b/examples/fluids/src/dm_utils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/grid_anisotropy_tensor.c b/examples/fluids/src/grid_anisotropy_tensor.c
index 02f78bbb67..7f78edd3ca 100644
--- a/examples/fluids/src/grid_anisotropy_tensor.c
+++ b/examples/fluids/src/grid_anisotropy_tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/inverse_multiplicity.c b/examples/fluids/src/inverse_multiplicity.c
index 2d71cc15fe..5d3b7dd9ae 100644
--- a/examples/fluids/src/inverse_multiplicity.c
+++ b/examples/fluids/src/inverse_multiplicity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/log_events.c b/examples/fluids/src/log_events.c
index 1bf3b3b039..fce0368614 100644
--- a/examples/fluids/src/log_events.c
+++ b/examples/fluids/src/log_events.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 3765ee649d..8b54f7c90b 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index b9b51209a3..982979f99e 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/qdata.c b/examples/fluids/src/qdata.c
index 6883c2a8c3..d39f7f1fc2 100644
--- a/examples/fluids/src/qdata.c
+++ b/examples/fluids/src/qdata.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/setupdm.c b/examples/fluids/src/setupdm.c
index 534b4e5b34..b4de50feb6 100644
--- a/examples/fluids/src/setupdm.c
+++ b/examples/fluids/src/setupdm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 9ce48ae762..7c172a3abc 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 9b9669fe45..1fdda9fb82 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/strong_boundary_conditions.c b/examples/fluids/src/strong_boundary_conditions.c
index eeb48cb7af..2e52a2ae8f 100644
--- a/examples/fluids/src/strong_boundary_conditions.c
+++ b/examples/fluids/src/strong_boundary_conditions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 4e8e7536c5..bd856d4af4 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/velocity_gradient_projection.c b/examples/fluids/src/velocity_gradient_projection.c
index 0ee457139a..232c46946d 100644
--- a/examples/fluids/src/velocity_gradient_projection.c
+++ b/examples/fluids/src/velocity_gradient_projection.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/Makefile b/examples/mfem/Makefile
index cb5abeba01..d94e4b01f1 100644
--- a/examples/mfem/Makefile
+++ b/examples/mfem/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp1.cpp b/examples/mfem/bp1.cpp
index f6a96dd536..22f420cc24 100644
--- a/examples/mfem/bp1.cpp
+++ b/examples/mfem/bp1.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp1.h b/examples/mfem/bp1.h
index df23dd4b51..47c4879707 100644
--- a/examples/mfem/bp1.h
+++ b/examples/mfem/bp1.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp1.hpp b/examples/mfem/bp1.hpp
index cb43675b56..7a3afbdcff 100644
--- a/examples/mfem/bp1.hpp
+++ b/examples/mfem/bp1.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp3.cpp b/examples/mfem/bp3.cpp
index b4e341db48..ccc435fdb9 100644
--- a/examples/mfem/bp3.cpp
+++ b/examples/mfem/bp3.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp3.h b/examples/mfem/bp3.h
index a546d8aea6..1c2d2bfb41 100644
--- a/examples/mfem/bp3.h
+++ b/examples/mfem/bp3.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp3.hpp b/examples/mfem/bp3.hpp
index d9b74474d0..f6cea2fc59 100644
--- a/examples/mfem/bp3.hpp
+++ b/examples/mfem/bp3.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/nek/bps/bps.h b/examples/nek/bps/bps.h
index a0b6a022c1..5e54d12e90 100644
--- a/examples/nek/bps/bps.h
+++ b/examples/nek/bps/bps.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/nek/bps/bps.usr b/examples/nek/bps/bps.usr
index 89f7fca164..24ccc9cd00 100644
--- a/examples/nek/bps/bps.usr
+++ b/examples/nek/bps/bps.usr
@@ -1,4 +1,4 @@
-C Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+C Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 C All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details.
 C
 C SPDX-License-Identifier: (BSD-2-Clause)
diff --git a/examples/petsc/Makefile b/examples/petsc/Makefile
index b465d25cfe..37f08a9dee 100644
--- a/examples/petsc/Makefile
+++ b/examples/petsc/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/area.c b/examples/petsc/area.c
index 1f5cca850a..9821caccf3 100644
--- a/examples/petsc/area.c
+++ b/examples/petsc/area.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/area.h b/examples/petsc/area.h
index 1b95f6d6df..08c69f43b4 100644
--- a/examples/petsc/area.h
+++ b/examples/petsc/area.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index 3d6e475385..be55bbf5c4 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bps.h b/examples/petsc/bps.h
index 9100c8af47..1867020c74 100644
--- a/examples/petsc/bps.h
+++ b/examples/petsc/bps.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index e8d901b410..ed8ae0aca8 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c
index 043465b789..df1e7bbdc0 100644
--- a/examples/petsc/bpssphere.c
+++ b/examples/petsc/bpssphere.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpssphere.h b/examples/petsc/bpssphere.h
index c3c7678f54..80e64144a0 100644
--- a/examples/petsc/bpssphere.h
+++ b/examples/petsc/bpssphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 3ff9685c8e..9fc1320b21 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/dmswarm.c b/examples/petsc/dmswarm.c
index 113730e15a..cc618413bf 100644
--- a/examples/petsc/dmswarm.c
+++ b/examples/petsc/dmswarm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/areaproblemdata.h b/examples/petsc/include/areaproblemdata.h
index cb5a254085..290dc86bdb 100644
--- a/examples/petsc/include/areaproblemdata.h
+++ b/examples/petsc/include/areaproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/bpsproblemdata.h b/examples/petsc/include/bpsproblemdata.h
index f89aadc318..db5f2b7d04 100644
--- a/examples/petsc/include/bpsproblemdata.h
+++ b/examples/petsc/include/bpsproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/libceedsetup.h b/examples/petsc/include/libceedsetup.h
index 19f5338784..dee9e9d730 100644
--- a/examples/petsc/include/libceedsetup.h
+++ b/examples/petsc/include/libceedsetup.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/matops.h b/examples/petsc/include/matops.h
index 8c29f9e76a..5d2162229c 100644
--- a/examples/petsc/include/matops.h
+++ b/examples/petsc/include/matops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/petscutils.h b/examples/petsc/include/petscutils.h
index b1b07f7672..8b9d10d075 100644
--- a/examples/petsc/include/petscutils.h
+++ b/examples/petsc/include/petscutils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/petscversion.h b/examples/petsc/include/petscversion.h
index bbb377d3cc..4ef951a893 100644
--- a/examples/petsc/include/petscversion.h
+++ b/examples/petsc/include/petscversion.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/sphereproblemdata.h b/examples/petsc/include/sphereproblemdata.h
index 5142d9eeba..a4a2b7f8b4 100644
--- a/examples/petsc/include/sphereproblemdata.h
+++ b/examples/petsc/include/sphereproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/structs.h b/examples/petsc/include/structs.h
index c33ad80b9b..63e1656763 100644
--- a/examples/petsc/include/structs.h
+++ b/examples/petsc/include/structs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/swarmutils.h b/examples/petsc/include/swarmutils.h
index 0eeff6e301..14b16c48b7 100644
--- a/examples/petsc/include/swarmutils.h
+++ b/examples/petsc/include/swarmutils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index 60926fef5f..a59011074b 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/area/areacube.h b/examples/petsc/qfunctions/area/areacube.h
index 1cc7fcccab..e041bb3ad2 100644
--- a/examples/petsc/qfunctions/area/areacube.h
+++ b/examples/petsc/qfunctions/area/areacube.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/area/areasphere.h b/examples/petsc/qfunctions/area/areasphere.h
index 88ee221a7f..902e2e17ae 100644
--- a/examples/petsc/qfunctions/area/areasphere.h
+++ b/examples/petsc/qfunctions/area/areasphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp1.h b/examples/petsc/qfunctions/bps/bp1.h
index b5a1f0ad11..060ec3f395 100644
--- a/examples/petsc/qfunctions/bps/bp1.h
+++ b/examples/petsc/qfunctions/bps/bp1.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp1sphere.h b/examples/petsc/qfunctions/bps/bp1sphere.h
index 0129a3ba66..ba5162be73 100644
--- a/examples/petsc/qfunctions/bps/bp1sphere.h
+++ b/examples/petsc/qfunctions/bps/bp1sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp2.h b/examples/petsc/qfunctions/bps/bp2.h
index 12c5fc3521..c0660f76af 100644
--- a/examples/petsc/qfunctions/bps/bp2.h
+++ b/examples/petsc/qfunctions/bps/bp2.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp2sphere.h b/examples/petsc/qfunctions/bps/bp2sphere.h
index 2ebff9ef91..2370699150 100644
--- a/examples/petsc/qfunctions/bps/bp2sphere.h
+++ b/examples/petsc/qfunctions/bps/bp2sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp3.h b/examples/petsc/qfunctions/bps/bp3.h
index a3674ed031..ade23682c8 100644
--- a/examples/petsc/qfunctions/bps/bp3.h
+++ b/examples/petsc/qfunctions/bps/bp3.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp3sphere.h b/examples/petsc/qfunctions/bps/bp3sphere.h
index fdc16b4c84..db5064d38a 100644
--- a/examples/petsc/qfunctions/bps/bp3sphere.h
+++ b/examples/petsc/qfunctions/bps/bp3sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp4.h b/examples/petsc/qfunctions/bps/bp4.h
index 4f8f6fd58d..4771fb479d 100644
--- a/examples/petsc/qfunctions/bps/bp4.h
+++ b/examples/petsc/qfunctions/bps/bp4.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp4sphere.h b/examples/petsc/qfunctions/bps/bp4sphere.h
index 39b631173b..cd26f72767 100644
--- a/examples/petsc/qfunctions/bps/bp4sphere.h
+++ b/examples/petsc/qfunctions/bps/bp4sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/common.h b/examples/petsc/qfunctions/bps/common.h
index fd38dbc13d..29e5c5709a 100644
--- a/examples/petsc/qfunctions/bps/common.h
+++ b/examples/petsc/qfunctions/bps/common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/swarm/swarmmass.h b/examples/petsc/qfunctions/swarm/swarmmass.h
index 4c321871fe..7eefea5806 100644
--- a/examples/petsc/qfunctions/swarm/swarmmass.h
+++ b/examples/petsc/qfunctions/swarm/swarmmass.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/src/libceedsetup.c b/examples/petsc/src/libceedsetup.c
index 19ecd1880a..50c174a939 100644
--- a/examples/petsc/src/libceedsetup.c
+++ b/examples/petsc/src/libceedsetup.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/src/petscutils.c b/examples/petsc/src/petscutils.c
index 4ec84c547b..990a75f1fd 100644
--- a/examples/petsc/src/petscutils.c
+++ b/examples/petsc/src/petscutils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/src/swarmutils.c b/examples/petsc/src/swarmutils.c
index 64901f8ed9..f7c37ac0de 100644
--- a/examples/petsc/src/swarmutils.c
+++ b/examples/petsc/src/swarmutils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs
index 6120d41999..2f30d497e6 100644
--- a/examples/rust/ex1-volume/src/main.rs
+++ b/examples/rust/ex1-volume/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume/src/opt.rs b/examples/rust/ex1-volume/src/opt.rs
index 94fca2594a..66d5ca74b8 100644
--- a/examples/rust/ex1-volume/src/opt.rs
+++ b/examples/rust/ex1-volume/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume/src/transform.rs b/examples/rust/ex1-volume/src/transform.rs
index 6ebe14bc6f..a66087a330 100644
--- a/examples/rust/ex1-volume/src/transform.rs
+++ b/examples/rust/ex1-volume/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index 1c44e7cba5..a7d20dedbf 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface/src/opt.rs b/examples/rust/ex2-surface/src/opt.rs
index fcf903e501..176d2bab80 100644
--- a/examples/rust/ex2-surface/src/opt.rs
+++ b/examples/rust/ex2-surface/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface/src/transform.rs b/examples/rust/ex2-surface/src/transform.rs
index 085d9bc94d..666c56c886 100644
--- a/examples/rust/ex2-surface/src/transform.rs
+++ b/examples/rust/ex2-surface/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex3-vector-volume/src/main.rs
index a2102e5d0f..f16307f066 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex3-vector-volume/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-vector-volume/src/opt.rs b/examples/rust/ex3-vector-volume/src/opt.rs
index 7ece85540f..e74dba8bf6 100644
--- a/examples/rust/ex3-vector-volume/src/opt.rs
+++ b/examples/rust/ex3-vector-volume/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-vector-volume/src/transform.rs b/examples/rust/ex3-vector-volume/src/transform.rs
index 6ebe14bc6f..a66087a330 100644
--- a/examples/rust/ex3-vector-volume/src/transform.rs
+++ b/examples/rust/ex3-vector-volume/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex4-vector-surface/src/main.rs
index 5b788274e8..d04fdb6746 100644
--- a/examples/rust/ex4-vector-surface/src/main.rs
+++ b/examples/rust/ex4-vector-surface/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex4-vector-surface/src/opt.rs b/examples/rust/ex4-vector-surface/src/opt.rs
index 7b335a8e53..1ad9d895a4 100644
--- a/examples/rust/ex4-vector-surface/src/opt.rs
+++ b/examples/rust/ex4-vector-surface/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex4-vector-surface/src/transform.rs b/examples/rust/ex4-vector-surface/src/transform.rs
index 085d9bc94d..666c56c886 100644
--- a/examples/rust/ex4-vector-surface/src/transform.rs
+++ b/examples/rust/ex4-vector-surface/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs
index 9ad2810381..0b7a9da4e5 100644
--- a/examples/rust/mesh/src/lib.rs
+++ b/examples/rust/mesh/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/Makefile b/examples/solids/Makefile
index 484d71eda7..54a560f02e 100644
--- a/examples/solids/Makefile
+++ b/examples/solids/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/elasticity.c b/examples/solids/elasticity.c
index 7de51bd0ee..06e56f6d3d 100644
--- a/examples/solids/elasticity.c
+++ b/examples/solids/elasticity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/elasticity.h b/examples/solids/elasticity.h
index a83789a30f..3e964cc7c4 100644
--- a/examples/solids/elasticity.h
+++ b/examples/solids/elasticity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/boundary.h b/examples/solids/include/boundary.h
index 7143b7c262..c58c00e888 100644
--- a/examples/solids/include/boundary.h
+++ b/examples/solids/include/boundary.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/cl-options.h b/examples/solids/include/cl-options.h
index 9c56398139..a6db168fad 100644
--- a/examples/solids/include/cl-options.h
+++ b/examples/solids/include/cl-options.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/matops.h b/examples/solids/include/matops.h
index ca57b33356..aead345453 100644
--- a/examples/solids/include/matops.h
+++ b/examples/solids/include/matops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/misc.h b/examples/solids/include/misc.h
index 5836d14ff6..03738cb4d1 100644
--- a/examples/solids/include/misc.h
+++ b/examples/solids/include/misc.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/setup-dm.h b/examples/solids/include/setup-dm.h
index 8fcfe7a63b..ff07f7e86f 100644
--- a/examples/solids/include/setup-dm.h
+++ b/examples/solids/include/setup-dm.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/setup-libceed.h b/examples/solids/include/setup-libceed.h
index be8ad14e9b..fc1606bf68 100644
--- a/examples/solids/include/setup-libceed.h
+++ b/examples/solids/include/setup-libceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/structs.h b/examples/solids/include/structs.h
index 8c63ce1199..75f9e3c5ce 100644
--- a/examples/solids/include/structs.h
+++ b/examples/solids/include/structs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/utils.h b/examples/solids/include/utils.h
index 31188d47e3..82832a3755 100644
--- a/examples/solids/include/utils.h
+++ b/examples/solids/include/utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/cl-problems.h b/examples/solids/problems/cl-problems.h
index 8a9036c995..ec9a10b2ff 100644
--- a/examples/solids/problems/cl-problems.h
+++ b/examples/solids/problems/cl-problems.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/finite-strain-mooney-rivlin.c b/examples/solids/problems/finite-strain-mooney-rivlin.c
index 6ce2201907..bae739e175 100644
--- a/examples/solids/problems/finite-strain-mooney-rivlin.c
+++ b/examples/solids/problems/finite-strain-mooney-rivlin.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/finite-strain-neo-hookean.c b/examples/solids/problems/finite-strain-neo-hookean.c
index fac1e47ba6..d7ae867a6f 100644
--- a/examples/solids/problems/finite-strain-neo-hookean.c
+++ b/examples/solids/problems/finite-strain-neo-hookean.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/linear.c b/examples/solids/problems/linear.c
index 051b2f1155..a733acc70a 100644
--- a/examples/solids/problems/linear.c
+++ b/examples/solids/problems/linear.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/mooney-rivlin.c b/examples/solids/problems/mooney-rivlin.c
index 2449e98742..2f6de2337f 100644
--- a/examples/solids/problems/mooney-rivlin.c
+++ b/examples/solids/problems/mooney-rivlin.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/mooney-rivlin.h b/examples/solids/problems/mooney-rivlin.h
index 2063e06e19..eca930aea0 100644
--- a/examples/solids/problems/mooney-rivlin.h
+++ b/examples/solids/problems/mooney-rivlin.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/neo-hookean.c b/examples/solids/problems/neo-hookean.c
index 560717673e..86d2b9b0f0 100644
--- a/examples/solids/problems/neo-hookean.c
+++ b/examples/solids/problems/neo-hookean.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/neo-hookean.h b/examples/solids/problems/neo-hookean.h
index 72f6a2ed65..eeeb5e3b42 100644
--- a/examples/solids/problems/neo-hookean.h
+++ b/examples/solids/problems/neo-hookean.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/problems.c b/examples/solids/problems/problems.c
index 1ee1c4c215..e86686ce61 100644
--- a/examples/solids/problems/problems.c
+++ b/examples/solids/problems/problems.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/problems.h b/examples/solids/problems/problems.h
index 41c4271ffc..d7856b703c 100644
--- a/examples/solids/problems/problems.h
+++ b/examples/solids/problems/problems.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/common.h b/examples/solids/qfunctions/common.h
index acaa815cc5..3bd1d37f0f 100644
--- a/examples/solids/qfunctions/common.h
+++ b/examples/solids/qfunctions/common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/constant-force.h b/examples/solids/qfunctions/constant-force.h
index 232f97588e..147686156d 100644
--- a/examples/solids/qfunctions/constant-force.h
+++ b/examples/solids/qfunctions/constant-force.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/finite-strain-mooney-rivlin.h b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
index 7a802693d2..74975966fe 100644
--- a/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
+++ b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean.h b/examples/solids/qfunctions/finite-strain-neo-hookean.h
index 9b1ff27979..ea7486677a 100644
--- a/examples/solids/qfunctions/finite-strain-neo-hookean.h
+++ b/examples/solids/qfunctions/finite-strain-neo-hookean.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/linear.h b/examples/solids/qfunctions/linear.h
index 57f5fe4f61..6e300af27c 100644
--- a/examples/solids/qfunctions/linear.h
+++ b/examples/solids/qfunctions/linear.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/manufactured-force.h b/examples/solids/qfunctions/manufactured-force.h
index de48be4ba3..9c063409bb 100644
--- a/examples/solids/qfunctions/manufactured-force.h
+++ b/examples/solids/qfunctions/manufactured-force.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/manufactured-true.h b/examples/solids/qfunctions/manufactured-true.h
index 6fd97c1b13..943bca1686 100644
--- a/examples/solids/qfunctions/manufactured-true.h
+++ b/examples/solids/qfunctions/manufactured-true.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/traction-boundary.h b/examples/solids/qfunctions/traction-boundary.h
index 7fd59c742c..797cb7cdcd 100644
--- a/examples/solids/qfunctions/traction-boundary.h
+++ b/examples/solids/qfunctions/traction-boundary.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/boundary.c b/examples/solids/src/boundary.c
index 2985d0d21a..9f63128e3f 100644
--- a/examples/solids/src/boundary.c
+++ b/examples/solids/src/boundary.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/cl-options.c b/examples/solids/src/cl-options.c
index 996ffcfbcf..b2e203dda7 100644
--- a/examples/solids/src/cl-options.c
+++ b/examples/solids/src/cl-options.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/matops.c b/examples/solids/src/matops.c
index bef9960fb5..def1109c00 100644
--- a/examples/solids/src/matops.c
+++ b/examples/solids/src/matops.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/misc.c b/examples/solids/src/misc.c
index d4f1986473..1633b628f5 100644
--- a/examples/solids/src/misc.c
+++ b/examples/solids/src/misc.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/setup-dm.c b/examples/solids/src/setup-dm.c
index 07c7f179fe..38fdd3889a 100644
--- a/examples/solids/src/setup-dm.c
+++ b/examples/solids/src/setup-dm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/setup-libceed.c b/examples/solids/src/setup-libceed.c
index 717fda952f..16f3b076af 100644
--- a/examples/solids/src/setup-libceed.c
+++ b/examples/solids/src/setup-libceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/ceed-gallery-list.h b/gallery/ceed-gallery-list.h
index 4fa8a08227..9014adad07 100644
--- a/gallery/ceed-gallery-list.h
+++ b/gallery/ceed-gallery-list.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/ceed-gallery-weak.c b/gallery/ceed-gallery-weak.c
index bb983b9a56..02065b51fc 100644
--- a/gallery/ceed-gallery-weak.c
+++ b/gallery/ceed-gallery-weak.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/identity/ceed-identity.c b/gallery/identity/ceed-identity.c
index aa7f59eda4..07bdb7d3c0 100644
--- a/gallery/identity/ceed-identity.c
+++ b/gallery/identity/ceed-identity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass-vector/ceed-vectormassapply.c b/gallery/mass-vector/ceed-vectormassapply.c
index c47d77fe48..6899f1bfb8 100644
--- a/gallery/mass-vector/ceed-vectormassapply.c
+++ b/gallery/mass-vector/ceed-vectormassapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-mass1dbuild.c b/gallery/mass/ceed-mass1dbuild.c
index db4454f7b6..798e1924d8 100644
--- a/gallery/mass/ceed-mass1dbuild.c
+++ b/gallery/mass/ceed-mass1dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-mass2dbuild.c b/gallery/mass/ceed-mass2dbuild.c
index 52e10dec27..766ec2f999 100644
--- a/gallery/mass/ceed-mass2dbuild.c
+++ b/gallery/mass/ceed-mass2dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-mass3dbuild.c b/gallery/mass/ceed-mass3dbuild.c
index fcb3ab23f3..fcc428d1c6 100644
--- a/gallery/mass/ceed-mass3dbuild.c
+++ b/gallery/mass/ceed-mass3dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-massapply.c b/gallery/mass/ceed-massapply.c
index d213a7a359..232f137d1c 100644
--- a/gallery/mass/ceed-massapply.c
+++ b/gallery/mass/ceed-massapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
index 2e8578d5a3..a2c5aa4b5c 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
index 8eb96609ff..4170ea7e56 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
index 2506db2b45..47bfbfac50 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson1dapply.c b/gallery/poisson/ceed-poisson1dapply.c
index a9b6cef825..02edfa39d9 100644
--- a/gallery/poisson/ceed-poisson1dapply.c
+++ b/gallery/poisson/ceed-poisson1dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson1dbuild.c b/gallery/poisson/ceed-poisson1dbuild.c
index 69f4e1fb50..98eab20581 100644
--- a/gallery/poisson/ceed-poisson1dbuild.c
+++ b/gallery/poisson/ceed-poisson1dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson2dapply.c b/gallery/poisson/ceed-poisson2dapply.c
index 5eb2d058bb..d8dea0c3cd 100644
--- a/gallery/poisson/ceed-poisson2dapply.c
+++ b/gallery/poisson/ceed-poisson2dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson2dbuild.c b/gallery/poisson/ceed-poisson2dbuild.c
index 60a13dd7a6..0772c7961c 100644
--- a/gallery/poisson/ceed-poisson2dbuild.c
+++ b/gallery/poisson/ceed-poisson2dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson3dapply.c b/gallery/poisson/ceed-poisson3dapply.c
index 7af449b13e..d5742ed0dc 100644
--- a/gallery/poisson/ceed-poisson3dapply.c
+++ b/gallery/poisson/ceed-poisson3dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson3dbuild.c b/gallery/poisson/ceed-poisson3dbuild.c
index 5471701b10..63004755de 100644
--- a/gallery/poisson/ceed-poisson3dbuild.c
+++ b/gallery/poisson/ceed-poisson3dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/scale/ceed-scale.c b/gallery/scale/ceed-scale.c
index 93fd9be24e..77aff92063 100644
--- a/gallery/scale/ceed-scale.c
+++ b/gallery/scale/ceed-scale.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed-fortran-name.h b/include/ceed-fortran-name.h
index 192356fbc6..5f70b6c879 100644
--- a/include/ceed-fortran-name.h
+++ b/include/ceed-fortran-name.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 7af9ecb8c9..4e7941a350 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 7f686660ed..a599730de8 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h
index d928f5158a..0bce734257 100644
--- a/include/ceed/ceed-f32.h
+++ b/include/ceed/ceed-f32.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h
index bcab40cfd2..b74d867c18 100644
--- a/include/ceed/ceed-f64.h
+++ b/include/ceed/ceed-f64.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 872124c765..26911b0a67 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/cuda.h b/include/ceed/cuda.h
index 839e64fed7..4a53a5239e 100644
--- a/include/ceed/cuda.h
+++ b/include/ceed/cuda.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/fortran.h b/include/ceed/fortran.h
index bb7bcac396..08bb627e57 100644
--- a/include/ceed/fortran.h
+++ b/include/ceed/fortran.h
@@ -1,4 +1,4 @@
-! Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+! Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 !
 ! SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/hip.h b/include/ceed/hip.h
index 2c0e156872..5f4bdd149a 100644
--- a/include/ceed/hip.h
+++ b/include/ceed/hip.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
index 6c3712c36b..7cef2f83a2 100644
--- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
+++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index b795da6ca4..f80a2af717 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h
index 1aedb54dbe..baa15a1e85 100644
--- a/include/ceed/jit-source/cuda/cuda-jit.h
+++ b/include/ceed/jit-source/cuda/cuda-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
index 9f0fa61b49..4316f70c7e 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
index afee25eb8d..b6e90450a8 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 134547ecce..c461d0ce30 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index a5ed841a11..a39a9fede7 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
index 7a74ea9723..54fe5ec1ad 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 1de68e76c8..569c2728be 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
index 6b26aee037..3fb97139dc 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
index 039eab8cbf..48c9f13063 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
index 48d8bda313..f8ad690489 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
index 50c0ddbe92..264524e728 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
index dca2dbb6c7..e8217e4b9d 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
index 4d297b09c3..a72250d311 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
index 4cb265e78a..9cd7ed94f7 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index 3a7fb2e241..563198db15 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index 066f95ed58..cb62c4f80b 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 32437bf4c4..50ee86a077 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index 1b318f80e0..30a1997585 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index d49cfcd717..c7013e3300 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 312442c3aa..5903c8c747 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h
index f80d7193d9..9acb0064a3 100644
--- a/include/ceed/jit-source/cuda/cuda-types.h
+++ b/include/ceed/jit-source/cuda/cuda-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h
index 81a005d664..0c99c75507 100644
--- a/include/ceed/jit-source/gallery/ceed-identity.h
+++ b/include/ceed/jit-source/gallery/ceed-identity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
index 4db3634acd..d37985aba7 100644
--- a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
index 583441007a..02e11a30e1 100644
--- a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
index 855f48682c..692bb2e917 100644
--- a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-massapply.h b/include/ceed/jit-source/gallery/ceed-massapply.h
index 4ec920ac7a..70559b3429 100644
--- a/include/ceed/jit-source/gallery/ceed-massapply.h
+++ b/include/ceed/jit-source/gallery/ceed-massapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
index 3d6bbfe513..33709c7ee2 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
index 07096cca96..eb619b3056 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
index 5c46422ecf..808b1eb988 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
index 0f4e0b3f54..afa4c6d64a 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
index c78c2ecbf4..4b0894b9dd 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
index b2013de28b..6a3a818cad 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-scale.h b/include/ceed/jit-source/gallery/ceed-scale.h
index 6ffe081815..d7a7098b3f 100644
--- a/include/ceed/jit-source/gallery/ceed-scale.h
+++ b/include/ceed/jit-source/gallery/ceed-scale.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectormassapply.h b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
index 40825f77f2..c07ebc7fc8 100644
--- a/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
index 4101c7f886..ef8552bf32 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
index 061fe75355..aef318b9c5 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
index 7aabaa9025..52fb3565c1 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 53e02133bb..80f8d047b0 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h
index 2ac1968b2d..70a00416e4 100644
--- a/include/ceed/jit-source/hip/hip-jit.h
+++ b/include/ceed/jit-source/hip/hip-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
index 9d840f1edd..546cef8780 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
index 6efbf47054..f0707ee270 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 188846386b..302ea9fff6 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index e5cf318dc1..2642a22ae8 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
index c9eed447e6..c73a0b8063 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index 38625c7c3d..d9bcb07cd9 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-qfunction.h b/include/ceed/jit-source/hip/hip-ref-qfunction.h
index f0d436572d..f26ec054b1 100644
--- a/include/ceed/jit-source/hip/hip-ref-qfunction.h
+++ b/include/ceed/jit-source/hip/hip-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
index cdc06d6061..15fad9984a 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
index 12b3a0250b..cc23fa3a52 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
index 3d0d68cb10..80d746503e 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
index 155173de63..a88dac2295 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
index 8af0528756..f3443b33d4 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
index 898b0ff331..45c306a217 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
index dabe392f10..9c255f8e75 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index 47b4eae92f..349ea253b0 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 923de63395..0d73e57aac 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 81cca74474..b7cb81d07f 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index be7857ded1..6425546eb1 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 13e3690b38..8b562b1f90 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-types.h b/include/ceed/jit-source/hip/hip-types.h
index 01fe82e08d..7befa1b492 100644
--- a/include/ceed/jit-source/hip/hip-types.h
+++ b/include/ceed/jit-source/hip/hip-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
index 998b0d5020..ba7c645262 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
index b9fedf5c8e..671da961e8 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
index 64572a6510..b833ad7609 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
index c281d430dc..fb1fa90b33 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
index fc2bba223a..956059fea9 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
index 7c214c8624..6011db7e97 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
index 07b4386c07..f3dfa150ee 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
index 8333a3cfc4..ad8e6290d2 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
index 8fa903096b..9ef709af0e 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
index 2405188dcc..bcfa89df40 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
index 4052025c91..5233afa6c9 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h
index 5dc3550b76..a2ea52628a 100644
--- a/include/ceed/jit-source/magma/magma-common-defs.h
+++ b/include/ceed/jit-source/magma/magma-common-defs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h
index 945227d145..70a73247b1 100644
--- a/include/ceed/jit-source/magma/magma-common-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-common-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h
index 494afacd87..87ec727c0d 100644
--- a/include/ceed/jit-source/magma/magma-common-tensor.h
+++ b/include/ceed/jit-source/magma/magma-common-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h
index cf6f6cbc15..b028924996 100644
--- a/include/ceed/jit-source/sycl/sycl-gen-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h
index f4824d8a34..1a2971f4df 100644
--- a/include/ceed/jit-source/sycl/sycl-jit.h
+++ b/include/ceed/jit-source/sycl/sycl-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
index 24b7de724f..b22d86ec33 100644
--- a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
+++ b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
index 06587592da..1105df17f2 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
index bd6ec34052..8d37c1ba82 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
index fc38b00351..3593f8ab7d 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h
index 58938a4b2a..3d57991fa4 100644
--- a/include/ceed/jit-source/sycl/sycl-types.h
+++ b/include/ceed/jit-source/sycl/sycl-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-tools.h b/include/ceed/jit-tools.h
index 60e0795f50..1213f974d2 100644
--- a/include/ceed/jit-tools.h
+++ b/include/ceed/jit-tools.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/types.h b/include/ceed/types.h
index 6c6d126548..2739390d8e 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 46edf8213e..d17b8466f9 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-cuda.c b/interface/ceed-cuda.c
index ff28d10d14..b54c706720 100644
--- a/interface/ceed-cuda.c
+++ b/interface/ceed-cuda.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 1f705ecd2e..3a64789a94 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c
index 501d901c2e..73f8c801b9 100644
--- a/interface/ceed-fortran.c
+++ b/interface/ceed-fortran.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-hip.c b/interface/ceed-hip.c
index f14df51eb5..911a374a94 100644
--- a/interface/ceed-hip.c
+++ b/interface/ceed-hip.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-source-root-default.c b/interface/ceed-jit-source-root-default.c
index 6f1bc47e6c..27587c6fd9 100644
--- a/interface/ceed-jit-source-root-default.c
+++ b/interface/ceed-jit-source-root-default.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-source-root-install.c b/interface/ceed-jit-source-root-install.c
index ffa78b21d5..9b41385694 100644
--- a/interface/ceed-jit-source-root-install.c
+++ b/interface/ceed-jit-source-root-install.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index 14f1babb87..978bc41f61 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 445adbeeb6..be83326781 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 700699e75b..970f3549b5 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-qfunction-register.c b/interface/ceed-qfunction-register.c
index 3558d0a225..bdce3f8815 100644
--- a/interface/ceed-qfunction-register.c
+++ b/interface/ceed-qfunction-register.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 6936974fa4..cfce6575df 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index a32dee73a9..2568133fcb 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-register.c b/interface/ceed-register.c
index bdc8a95d10..2d3413eaf2 100644
--- a/interface/ceed-register.c
+++ b/interface/ceed-register.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c
index dd8b81a118..2ae2b4446f 100644
--- a/interface/ceed-tensor.c
+++ b/interface/ceed-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-types.c b/interface/ceed-types.c
index 564a5b009a..e975793307 100644
--- a/interface/ceed-types.c
+++ b/interface/ceed-types.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 839946762b..70c7b5464f 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed.c b/interface/ceed.c
index 6ebade1da0..0caff73f62 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/__init__.py b/python/__init__.py
index 9c6560addb..9c77ff8833 100644
--- a/python/__init__.py
+++ b/python/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/build_ceed_cffi.py b/python/build_ceed_cffi.py
index 6a77781d25..cf302c85a1 100644
--- a/python/build_ceed_cffi.py
+++ b/python/build_ceed_cffi.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed.py b/python/ceed.py
index 092cd1d047..c146f562f5 100644
--- a/python/ceed.py
+++ b/python/ceed.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_basis.py b/python/ceed_basis.py
index e1c12def62..2b41dc542a 100644
--- a/python/ceed_basis.py
+++ b/python/ceed_basis.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_constants.py b/python/ceed_constants.py
index ab99f7b643..b0df95fdaf 100644
--- a/python/ceed_constants.py
+++ b/python/ceed_constants.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_elemrestriction.py b/python/ceed_elemrestriction.py
index 9c986eb58e..b71df55685 100644
--- a/python/ceed_elemrestriction.py
+++ b/python/ceed_elemrestriction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_operator.py b/python/ceed_operator.py
index 740beef641..cce2ee5ae5 100644
--- a/python/ceed_operator.py
+++ b/python/ceed_operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_qfunction.py b/python/ceed_qfunction.py
index 896d69bfd4..a0c462efed 100644
--- a/python/ceed_qfunction.py
+++ b/python/ceed_qfunction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_qfunctioncontext.py b/python/ceed_qfunctioncontext.py
index 92c072bdd2..712adcc090 100644
--- a/python/ceed_qfunctioncontext.py
+++ b/python/ceed_qfunctioncontext.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_vector.py b/python/ceed_vector.py
index c72bb265ad..379d1e1913 100644
--- a/python/ceed_vector.py
+++ b/python/ceed_vector.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/Makefile b/python/tests/Makefile
index 918d1551ef..be8ad8e707 100644
--- a/python/tests/Makefile
+++ b/python/tests/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/conftest.py b/python/tests/conftest.py
index 62e8f4bb1d..6c763ac90a 100644
--- a/python/tests/conftest.py
+++ b/python/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/libceed-qfunctions.c b/python/tests/libceed-qfunctions.c
index bef055452a..8feb69aa91 100644
--- a/python/tests/libceed-qfunctions.c
+++ b/python/tests/libceed-qfunctions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/setup-qfunctions.py b/python/tests/setup-qfunctions.py
index aab21d830a..7a09e50e09 100644
--- a/python/tests/setup-qfunctions.py
+++ b/python/tests/setup-qfunctions.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/setup.cfg b/python/tests/setup.cfg
index e0bbfb441c..89e8bd3596 100644
--- a/python/tests/setup.cfg
+++ b/python/tests/setup.cfg
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-0-ceed.py b/python/tests/test-0-ceed.py
index b38d31a332..e8486049b4 100644
--- a/python/tests/test-0-ceed.py
+++ b/python/tests/test-0-ceed.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-1-vector.py b/python/tests/test-1-vector.py
index 73ed078bca..834212c72c 100644
--- a/python/tests/test-1-vector.py
+++ b/python/tests/test-1-vector.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-2-elemrestriction.py b/python/tests/test-2-elemrestriction.py
index 6f9b1a3c38..f85beef4ab 100644
--- a/python/tests/test-2-elemrestriction.py
+++ b/python/tests/test-2-elemrestriction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-3-basis.py b/python/tests/test-3-basis.py
index 453e0b8401..df62c24bdc 100644
--- a/python/tests/test-3-basis.py
+++ b/python/tests/test-3-basis.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-4-qfunction.py b/python/tests/test-4-qfunction.py
index 42dd844e2f..7fd4ea41b1 100644
--- a/python/tests/test-4-qfunction.py
+++ b/python/tests/test-4-qfunction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-5-operator.py b/python/tests/test-5-operator.py
index 7127fe395e..39219230c5 100644
--- a/python/tests/test-5-operator.py
+++ b/python/tests/test-5-operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-qfunctions.h b/python/tests/test-qfunctions.h
index eb9a5f3f1d..f4b5aa30d8 100644
--- a/python/tests/test-qfunctions.h
+++ b/python/tests/test-qfunctions.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index 2e719097e7..d613745fc5 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index 081e08c61b..3a340e7b23 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index b248aa9fa7..9c68db4dc6 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 828ac5b998..7fd2891a66 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index f5b746eefd..cf1dbbdcf3 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index 3394ecd870..3793e437b0 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/tests/version-numbers.rs b/rust/libceed/tests/version-numbers.rs
index 5f276eae0b..5060c78398 100644
--- a/rust/libceed/tests/version-numbers.rs
+++ b/rust/libceed/tests/version-numbers.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
 // All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details.
 //
 // SPDX-License-Identifier: (BSD-2-Clause)
diff --git a/tests/t319-basis.h b/tests/t319-basis.h
index 6f1a0cb5c9..12e95b6aa1 100644
--- a/tests/t319-basis.h
+++ b/tests/t319-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t320-basis-f.h b/tests/t320-basis-f.h
index 93129e54de..762c754b64 100644
--- a/tests/t320-basis-f.h
+++ b/tests/t320-basis-f.h
@@ -1,4 +1,4 @@
-! Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+! Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 !
 ! SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t320-basis.h b/tests/t320-basis.h
index ef38e43b0a..942103a5e8 100644
--- a/tests/t320-basis.h
+++ b/tests/t320-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t330-basis.h b/tests/t330-basis.h
index 82ae5a3d81..9fdbe5e531 100644
--- a/tests/t330-basis.h
+++ b/tests/t330-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t340-basis.h b/tests/t340-basis.h
index 5fd8c420bc..9768c2623d 100644
--- a/tests/t340-basis.h
+++ b/tests/t340-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t400-qfunction.h b/tests/t400-qfunction.h
index b3e226df14..d3207139f6 100644
--- a/tests/t400-qfunction.h
+++ b/tests/t400-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t401-qfunction.h b/tests/t401-qfunction.h
index 465ec0b119..856fa98110 100644
--- a/tests/t401-qfunction.h
+++ b/tests/t401-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t405-qfunction.h b/tests/t405-qfunction.h
index 40be19b47d..4e2d211c3d 100644
--- a/tests/t405-qfunction.h
+++ b/tests/t405-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t406-qfunction-helper.h b/tests/t406-qfunction-helper.h
index 85fdf9999c..c000ee3d73 100644
--- a/tests/t406-qfunction-helper.h
+++ b/tests/t406-qfunction-helper.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t406-qfunction-scales.h b/tests/t406-qfunction-scales.h
index cde93275ff..90685238a2 100644
--- a/tests/t406-qfunction-scales.h
+++ b/tests/t406-qfunction-scales.h
@@ -3,7 +3,7 @@
 // Testing # on first line
 // Note: #ifndef and #pragma once header guards both work
 
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h
index 75a4229541..db9235d3ff 100644
--- a/tests/t406-qfunction.h
+++ b/tests/t406-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t409-qfunction.h b/tests/t409-qfunction.h
index 5348ffeb9d..78e9930e27 100644
--- a/tests/t409-qfunction.h
+++ b/tests/t409-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t500-operator.h b/tests/t500-operator.h
index 777978bc34..5efd4bac27 100644
--- a/tests/t500-operator.h
+++ b/tests/t500-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t502-operator.h b/tests/t502-operator.h
index 9915ee4282..5f7a9da561 100644
--- a/tests/t502-operator.h
+++ b/tests/t502-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t507-operator.h b/tests/t507-operator.h
index 3166f2ee69..adbb802ac2 100644
--- a/tests/t507-operator.h
+++ b/tests/t507-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t510-operator.h b/tests/t510-operator.h
index 20677b157a..cfe155e2bc 100644
--- a/tests/t510-operator.h
+++ b/tests/t510-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t522-operator.h b/tests/t522-operator.h
index 52aa9bae28..0685068099 100644
--- a/tests/t522-operator.h
+++ b/tests/t522-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t530-operator.h b/tests/t530-operator.h
index 20677b157a..cfe155e2bc 100644
--- a/tests/t530-operator.h
+++ b/tests/t530-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t531-operator.h b/tests/t531-operator.h
index 79a083f032..d310e303ae 100644
--- a/tests/t531-operator.h
+++ b/tests/t531-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t532-operator.h b/tests/t532-operator.h
index 6de6e8b669..a8e6755a7d 100644
--- a/tests/t532-operator.h
+++ b/tests/t532-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t534-operator.h b/tests/t534-operator.h
index cfe2bf73ac..83556af4f8 100644
--- a/tests/t534-operator.h
+++ b/tests/t534-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t535-operator.h b/tests/t535-operator.h
index ba3d5498cb..9510f5ae25 100644
--- a/tests/t535-operator.h
+++ b/tests/t535-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t537-operator.h b/tests/t537-operator.h
index f42f4fc1e4..71d1988e79 100644
--- a/tests/t537-operator.h
+++ b/tests/t537-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t539-operator.h b/tests/t539-operator.h
index c51487250b..a48cc7c13b 100644
--- a/tests/t539-operator.h
+++ b/tests/t539-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t540-operator.h b/tests/t540-operator.h
index f6052946aa..6278964c57 100644
--- a/tests/t540-operator.h
+++ b/tests/t540-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t541-operator.h b/tests/t541-operator.h
index 2f588f76be..2ccc6a6b1d 100644
--- a/tests/t541-operator.h
+++ b/tests/t541-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t566-operator.h b/tests/t566-operator.h
index a1c57cae55..3f5a7d90a6 100644
--- a/tests/t566-operator.h
+++ b/tests/t566-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t567-operator.h b/tests/t567-operator.h
index faee0aa5ac..5ab2e633a3 100644
--- a/tests/t567-operator.h
+++ b/tests/t567-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t568-operator.h b/tests/t568-operator.h
index 8cbb0ba8bf..6047197254 100644
--- a/tests/t568-operator.h
+++ b/tests/t568-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t580-operator.h b/tests/t580-operator.h
index e53f7817de..e23db70411 100644
--- a/tests/t580-operator.h
+++ b/tests/t580-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t590-operator.h b/tests/t590-operator.h
index d4c45b3735..71c26bd525 100644
--- a/tests/t590-operator.h
+++ b/tests/t590-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t591-operator.h b/tests/t591-operator.h
index 3385bf9dcc..2cffaee1a1 100644
--- a/tests/t591-operator.h
+++ b/tests/t591-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t595-operator.h b/tests/t595-operator.h
index e2dcddf09d..1e9e9e1ada 100644
--- a/tests/t595-operator.h
+++ b/tests/t595-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause

From ed66640a7bddfba6ec5e31af0b66705275f047c9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 16:40:04 -0600
Subject: [PATCH 344/571] ci - use newer libxsmm in CI

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c216de800a..b6cb108385 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -379,8 +379,8 @@ noether-float:
 # -- MAGMA from dev branch
 #    - echo "-------------- MAGMA ---------------"
 #    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
-    # -- LIBXSMM 7 April 2024
-    - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
+    # -- LIBXSMM 19 March 2025
+    - cd .. && export XSMM_HASH=ba9d6bc69c421c10f0597d582ea1ace6a6126308 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
   script:
     - rm -f .SUCCESS

From ff90b007c3dd968e763cc6a82bde1e90c117acbc Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 18 Mar 2025 14:54:31 -0600
Subject: [PATCH 345/571] vec - add stop to CeedVectorSetValueStrided

---
 backends/cuda-ref/ceed-cuda-ref-operator.c      | 16 +++++++++++++---
 backends/cuda-ref/ceed-cuda-ref-vector.c        | 13 +++++++------
 backends/cuda-ref/kernels/cuda-ref-vector.cu    |  8 ++++----
 backends/hip-ref/ceed-hip-ref-operator.c        | 16 +++++++++++++---
 backends/hip-ref/ceed-hip-ref-vector.c          | 13 +++++++------
 backends/hip-ref/kernels/hip-ref-vector.hip.cpp |  8 ++++----
 backends/memcheck/ceed-memcheck-vector.c        |  5 +++--
 include/ceed-impl.h                             |  2 +-
 include/ceed/ceed.h                             |  2 +-
 interface/ceed-vector.c                         | 10 ++++++----
 tests/t127-vector.c                             |  2 +-
 11 files changed, 60 insertions(+), 35 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 52d1797e49..5212546ce1 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1909,9 +1909,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       if (!is_active) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
-      else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s - 1, e_vec_size, 0.0));
-      CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s, e_vec_size, 1.0));
+      {
+        CeedInt  node = (s - 1) % elem_size, comp = (s - 1) / elem_size;
+        CeedSize start = node * 1 + comp * (elem_size * num_elem);
+        CeedSize stop  = start + (num_elem - 1) * elem_size + 1;
+
+        if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
+        else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
+
+        node = s % elem_size, comp = s / elem_size;
+        start = node * 1 + comp * (elem_size * num_elem);
+        stop  = start + (num_elem - 1) * elem_size + 1;
+        CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0));
+      }
 
       // Basis action
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 59815605d7..3455e67caa 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -336,31 +336,32 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
 //------------------------------------------------------------------------------
 // Set host array to value strided
 //------------------------------------------------------------------------------
-static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
-  for (CeedSize i = start; i < length; i += step) h_array[i] = val;
+static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
+  for (CeedSize i = start; i <= stop; i += step) h_array[i] = val;
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Set device array to value strided (impl in .cu file)
 //------------------------------------------------------------------------------
-int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val);
+int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val);
 
 //------------------------------------------------------------------------------
 // Set a vector to a value strided
 //------------------------------------------------------------------------------
-static int CeedVectorSetValueStrided_Cuda(CeedVector vec, CeedSize start, CeedSize step, CeedScalar val) {
+static int CeedVectorSetValueStrided_Cuda(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   CeedSize         length;
   CeedVector_Cuda *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   // Set value for synced device/host array
+  if (stop == -1) stop = length;
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, step, length, val));
+    CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, stop, step, length, val));
     impl->h_array = NULL;
   } else if (impl->h_array) {
-    CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, step, length, val));
+    CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, stop, step, length, val));
     impl->d_array = NULL;
   } else {
     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index e325629587..6a75f69824 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -57,10 +57,10 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, Cee
 //------------------------------------------------------------------------------
 // Kernel for set value strided on device
 //------------------------------------------------------------------------------
-__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index < size) {
+  if (index >= start && index < stop) {
     if ((index - start) % step == 0) vec[index] = val;
   }
 }
@@ -68,13 +68,13 @@ __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize s
 //------------------------------------------------------------------------------
 // Set value strided on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
+extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
   const int      block_size = 512;
   const CeedSize vec_size   = length;
   int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  setValueStridedK<<<grid_size, block_size>>>(d_array, start, step, length, val);
+  setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, length, val);
   return 0;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 3124a48eb9..86123a9e71 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1906,9 +1906,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       if (!is_active) continue;
 
       // Update unit vector
-      if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
-      else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s - 1, e_vec_size, 0.0));
-      CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, s, e_vec_size, 1.0));
+      {
+        CeedInt  node = (s - 1) % elem_size, comp = (s - 1) / elem_size;
+        CeedSize start = node * 1 + comp * (elem_size * num_elem);
+        CeedSize stop  = start + (num_elem - 1) * elem_size + 1;
+
+        if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
+        else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
+
+        node = s % elem_size, comp = s / elem_size;
+        start = node * 1 + comp * (elem_size * num_elem);
+        stop  = start + (num_elem - 1) * elem_size + 1;
+        CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0));
+      }
 
       // Basis action
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 238f4b5625..da597f1e92 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -336,31 +336,32 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
 //------------------------------------------------------------------------------
 // Set host array to value strided
 //------------------------------------------------------------------------------
-static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
-  for (CeedSize i = start; i < length; i += step) h_array[i] = val;
+static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
+  for (CeedSize i = start; i <= stop; i += step) h_array[i] = val;
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Set device array to value strided (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
-int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val);
+int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val);
 
 //------------------------------------------------------------------------------
 // Set a vector to a value strided
 //------------------------------------------------------------------------------
-static int CeedVectorSetValueStrided_Hip(CeedVector vec, CeedSize start, CeedSize step, CeedScalar val) {
+static int CeedVectorSetValueStrided_Hip(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   CeedSize        length;
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   // Set value for synced device/host array
+  if (stop == -1) stop = length;
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValueStrided_Hip(impl->d_array, start, step, length, val));
+    CeedCallBackend(CeedDeviceSetValueStrided_Hip(impl->d_array, start, stop, step, length, val));
     impl->h_array = NULL;
   } else if (impl->h_array) {
-    CeedCallBackend(CeedHostSetValueStrided_Hip(impl->h_array, start, step, length, val));
+    CeedCallBackend(CeedHostSetValueStrided_Hip(impl->h_array, start, stop, step, length, val));
     impl->d_array = NULL;
   } else {
     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index a45118a6df..50f10f0be3 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -57,10 +57,10 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed
 //------------------------------------------------------------------------------
 // Kernel for set value strided on device
 //------------------------------------------------------------------------------
-__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index < size) {
+  if (index >= start && index < stop) {
     if ((index - start) % step == 0) vec[index] = val;
   }
 }
@@ -68,13 +68,13 @@ __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize s
 //------------------------------------------------------------------------------
 // Set value strided on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
+extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedInt stop, CeedSize step, CeedSize length, CeedScalar val) {
   const int      block_size = 512;
   const CeedSize vec_size   = length;
   int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, val);
+  hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, stop, step, length, val);
   return 0;
 }
 
diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index 01187d06e9..1483ac9841 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -115,7 +115,7 @@ static int CeedVectorSetValue_Memcheck(CeedVector vec, CeedScalar value) {
 //------------------------------------------------------------------------------
 // Set internal array to value strided
 //------------------------------------------------------------------------------
-static int CeedVectorSetValueStrided_Memcheck(CeedVector vec, CeedSize start, CeedSize step, CeedScalar val) {
+static int CeedVectorSetValueStrided_Memcheck(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
@@ -124,7 +124,8 @@ static int CeedVectorSetValueStrided_Memcheck(CeedVector vec, CeedSize start, Ce
 
   if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
   assert(impl->array_allocated);
-  for (CeedSize i = start; i < length; i += step) impl->array_allocated[i] = val;
+  if (stop == -1) stop = length;
+  for (CeedSize i = start; i < stop; i += step) impl->array_allocated[i] = val;
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 4e7941a350..b626e9d129 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -140,7 +140,7 @@ struct CeedVector_private {
   int (*CopyStrided)(CeedVector, CeedSize, CeedSize, CeedVector);
   int (*SetArray)(CeedVector, CeedMemType, CeedCopyMode, CeedScalar *);
   int (*SetValue)(CeedVector, CeedScalar);
-  int (*SetValueStrided)(CeedVector, CeedSize, CeedSize, CeedScalar);
+  int (*SetValueStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedScalar);
   int (*SyncArray)(CeedVector, CeedMemType);
   int (*TakeArray)(CeedVector, CeedMemType, CeedScalar **);
   int (*GetArray)(CeedVector, CeedMemType, CeedScalar **);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 26911b0a67..7a8941cec0 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -184,7 +184,7 @@ CEED_EXTERN int  CeedVectorCopy(CeedVector vec, CeedVector vec_copy);
 CEED_EXTERN int  CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVector vec_copy);
 CEED_EXTERN int  CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array);
 CEED_EXTERN int  CeedVectorSetValue(CeedVector vec, CeedScalar value);
-CEED_EXTERN int  CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, CeedScalar value);
+CEED_EXTERN int  CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value);
 CEED_EXTERN int  CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type);
 CEED_EXTERN int  CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array);
 CEED_EXTERN int  CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array);
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 70c7b5464f..22ce5b368a 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -357,7 +357,8 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
   Note: The `CeedVector` must already have valid data set via @ref CeedVectorSetArray() or similar.
 
   @param[in,out] vec   `CeedVector`
-  @param[in]     start First index to set
+  @param[in]     start First index to set in range `[start, stop)`
+  @param[in]     stop  Last index to set in range `[start, stop)`, or `-1` for `length`
   @param[in]     step  Stride between indices to set
   @param[in]     value Value to be used
 
@@ -365,13 +366,13 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
 
   @ref User
 **/
-int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, CeedScalar value) {
+int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value) {
   CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
             "Cannot grant CeedVector array access, the access lock is already in use");
   CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   if (vec->SetValueStrided) {
-    CeedCall(vec->SetValueStrided(vec, start, step, value));
+    CeedCall(vec->SetValueStrided(vec, start, stop, step, value));
     vec->state += 2;
   } else {
     CeedSize    length;
@@ -379,8 +380,9 @@ int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedInt step, Ceed
 
     CeedCall(CeedVectorGetLength(vec, &length));
     if (length <= 0) return CEED_ERROR_SUCCESS;
+    if (stop == -1) stop = length;
     CeedCall(CeedVectorGetArray(vec, CEED_MEM_HOST, &array));
-    for (CeedSize i = start; i < length; i += step) array[i] = value;
+    for (CeedSize i = start; i < stop; i += step) array[i] = value;
     CeedCall(CeedVectorRestoreArray(vec, &array));
   }
   return CEED_ERROR_SUCCESS;
diff --git a/tests/t127-vector.c b/tests/t127-vector.c
index e9fb578d65..68e2470138 100644
--- a/tests/t127-vector.c
+++ b/tests/t127-vector.c
@@ -17,7 +17,7 @@ int main(int argc, char **argv) {
 
   // Set strided
   CeedVectorSetValue(x, 1.0);
-  CeedVectorSetValueStrided(x, start, step, 42.0);
+  CeedVectorSetValueStrided(x, start, -1, step, 42.0);
   {
     const CeedScalar *read_array;
 

From 2d73a370d2e8c78f703c6cd36fd60a5141320722 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 18 Mar 2025 15:47:59 -0600
Subject: [PATCH 346/571] vec - drop unused length arg in setValueStridedK

---
 backends/cuda-ref/ceed-cuda-ref-vector.c        | 2 +-
 backends/cuda-ref/kernels/cuda-ref-vector.cu    | 4 ++--
 backends/hip-ref/ceed-hip-ref-vector.c          | 2 +-
 backends/hip-ref/kernels/hip-ref-vector.hip.cpp | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 3455e67caa..f8e5963987 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -337,7 +337,7 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
 // Set host array to value strided
 //------------------------------------------------------------------------------
 static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
-  for (CeedSize i = start; i <= stop; i += step) h_array[i] = val;
+  for (CeedSize i = start; i < stop; i += step) h_array[i] = val;
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 6a75f69824..1122e768f0 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -57,7 +57,7 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, Cee
 //------------------------------------------------------------------------------
 // Kernel for set value strided on device
 //------------------------------------------------------------------------------
-__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
   if (index >= start && index < stop) {
@@ -74,7 +74,7 @@ extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize star
   int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, length, val);
+  setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, val);
   return 0;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index da597f1e92..9d16a2e20e 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -337,7 +337,7 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
 // Set host array to value strided
 //------------------------------------------------------------------------------
 static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
-  for (CeedSize i = start; i <= stop; i += step) h_array[i] = val;
+  for (CeedSize i = start; i < stop; i += step) h_array[i] = val;
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index 50f10f0be3..0249ea3f0b 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -57,7 +57,7 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed
 //------------------------------------------------------------------------------
 // Kernel for set value strided on device
 //------------------------------------------------------------------------------
-__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
   if (index >= start && index < stop) {
@@ -74,7 +74,7 @@ extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start
   int            grid_size  = vec_size / block_size;
 
   if (block_size * grid_size < vec_size) grid_size += 1;
-  hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, stop, step, length, val);
+  hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, stop, step, val);
   return 0;
 }
 

From b1a610ef203ae2e2bf8a9bddcc192eaf0a166984 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 18 Mar 2025 15:50:27 -0600
Subject: [PATCH 347/571] vec - check stop value in interface
 CeedVectorSetValueStrided

---
 interface/ceed-vector.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 22ce5b368a..5a6688659a 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -367,18 +367,20 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
   @ref User
 **/
 int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value) {
+  CeedSize length;
+
   CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
             "Cannot grant CeedVector array access, the access lock is already in use");
   CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCall(CeedVectorGetLength(vec, &length));
+  CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Invalid value for stop, must be in the range [-1, length]");
 
   if (vec->SetValueStrided) {
     CeedCall(vec->SetValueStrided(vec, start, stop, step, value));
     vec->state += 2;
   } else {
-    CeedSize    length;
     CeedScalar *array;
 
-    CeedCall(CeedVectorGetLength(vec, &length));
     if (length <= 0) return CEED_ERROR_SUCCESS;
     if (stop == -1) stop = length;
     CeedCall(CeedVectorGetArray(vec, CEED_MEM_HOST, &array));

From 7a747cf1bff4efd128e820e79284a45c73f1f102 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 08:32:42 -0600
Subject: [PATCH 348/571] vec - clearer strided bounds error message

Co-authored-by: Jed Brown <jed@jedbrown.org>
---
 interface/ceed-vector.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 5a6688659a..22346ecc32 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -373,7 +373,8 @@ int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, Cee
             "Cannot grant CeedVector array access, the access lock is already in use");
   CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
   CeedCall(CeedVectorGetLength(vec, &length));
-  CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Invalid value for stop, must be in the range [-1, length]");
+  CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Invalid value for stop %" CeedSize_FMT ", must be in the range [-1, length]", stop);
 
   if (vec->SetValueStrided) {
     CeedCall(vec->SetValueStrided(vec, start, stop, step, value));

From 126347300c7ca7241e05a0a707bdb6ba247c9e5a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 08:37:29 -0600
Subject: [PATCH 349/571] vec - clearer description of stop in SetValueStrided

---
 interface/ceed-vector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 22346ecc32..bb3b3079a8 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -358,7 +358,7 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
 
   @param[in,out] vec   `CeedVector`
   @param[in]     start First index to set in range `[start, stop)`
-  @param[in]     stop  Last index to set in range `[start, stop)`, or `-1` for `length`
+  @param[in]     stop  One past the last element to set in the range, or `-1` for `length`
   @param[in]     step  Stride between indices to set
   @param[in]     value Value to be used
 

From 14c82621672200cb0d16e80fc054f1ad38dbb785 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 08:51:07 -0600
Subject: [PATCH 350/571] gpu - fewer threads for SetValueStrided

---
 backends/cuda-ref/ceed-cuda-ref-vector.c        |  8 ++++----
 backends/cuda-ref/kernels/cuda-ref-vector.cu    | 12 ++++++------
 backends/hip-ref/ceed-hip-ref-vector.c          |  8 ++++----
 backends/hip-ref/kernels/hip-ref-vector.hip.cpp | 10 +++++-----
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index f8e5963987..5db2c4cb6b 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -336,7 +336,7 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
 //------------------------------------------------------------------------------
 // Set host array to value strided
 //------------------------------------------------------------------------------
-static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
+static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   for (CeedSize i = start; i < stop; i += step) h_array[i] = val;
   return CEED_ERROR_SUCCESS;
 }
@@ -344,7 +344,7 @@ static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, Cee
 //------------------------------------------------------------------------------
 // Set device array to value strided (impl in .cu file)
 //------------------------------------------------------------------------------
-int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val);
+int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val);
 
 //------------------------------------------------------------------------------
 // Set a vector to a value strided
@@ -358,10 +358,10 @@ static int CeedVectorSetValueStrided_Cuda(CeedVector vec, CeedSize start, CeedSi
   // Set value for synced device/host array
   if (stop == -1) stop = length;
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, stop, step, length, val));
+    CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, stop, step, val));
     impl->h_array = NULL;
   } else if (impl->h_array) {
-    CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, stop, step, length, val));
+    CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, stop, step, val));
     impl->d_array = NULL;
   } else {
     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 1122e768f0..9560279fbe 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -60,20 +60,20 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, Cee
 __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= start && index < stop) {
-    if ((index - start) % step == 0) vec[index] = val;
+  if (index < stop - start) {
+    if (index % step == 0) vec[start + index] = val;
   }
 }
 
 //------------------------------------------------------------------------------
 // Set value strided on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
+extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   const int      block_size = 512;
-  const CeedSize vec_size   = length;
-  int            grid_size  = vec_size / block_size;
+  const CeedSize set_size   = stop - start;
+  int            grid_size  = set_size / block_size;
 
-  if (block_size * grid_size < vec_size) grid_size += 1;
+  if (block_size * grid_size < set_size) grid_size += 1;
   setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, val);
   return 0;
 }
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 9d16a2e20e..e4d464de80 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -336,7 +336,7 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
 //------------------------------------------------------------------------------
 // Set host array to value strided
 //------------------------------------------------------------------------------
-static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val) {
+static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   for (CeedSize i = start; i < stop; i += step) h_array[i] = val;
   return CEED_ERROR_SUCCESS;
 }
@@ -344,7 +344,7 @@ static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, Ceed
 //------------------------------------------------------------------------------
 // Set device array to value strided (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
-int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedSize length, CeedScalar val);
+int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val);
 
 //------------------------------------------------------------------------------
 // Set a vector to a value strided
@@ -358,10 +358,10 @@ static int CeedVectorSetValueStrided_Hip(CeedVector vec, CeedSize start, CeedSiz
   // Set value for synced device/host array
   if (stop == -1) stop = length;
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValueStrided_Hip(impl->d_array, start, stop, step, length, val));
+    CeedCallBackend(CeedDeviceSetValueStrided_Hip(impl->d_array, start, stop, step, val));
     impl->h_array = NULL;
   } else if (impl->h_array) {
-    CeedCallBackend(CeedHostSetValueStrided_Hip(impl->h_array, start, stop, step, length, val));
+    CeedCallBackend(CeedHostSetValueStrided_Hip(impl->h_array, start, stop, step, val));
     impl->d_array = NULL;
   } else {
     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index 0249ea3f0b..0db492fc80 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -60,8 +60,8 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed
 __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= start && index < stop) {
-    if ((index - start) % step == 0) vec[index] = val;
+  if (index < stop - start) {
+    if (index % step == 0) vec[start + index] = val;
   }
 }
 
@@ -70,10 +70,10 @@ __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize s
 //------------------------------------------------------------------------------
 extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedInt stop, CeedSize step, CeedSize length, CeedScalar val) {
   const int      block_size = 512;
-  const CeedSize vec_size   = length;
-  int            grid_size  = vec_size / block_size;
+  const CeedSize set_size   = stop - start;
+  int            grid_size  = set_size / block_size;
 
-  if (block_size * grid_size < vec_size) grid_size += 1;
+  if (block_size * grid_size < set_size) grid_size += 1;
   hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, stop, step, val);
   return 0;
 }

From f52f9f6cf82557c448b4d7d27acc9d71dfcc9d91 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 08:55:14 -0600
Subject: [PATCH 351/571] vec - wording consistency

---
 interface/ceed-vector.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index bb3b3079a8..09156f4d94 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -994,8 +994,8 @@ int CeedVectorReciprocal(CeedVector vec) {
         Any portion of the provided range that is outside the range of valid indices for the `CeedVector` will be ignored.
 
   @param[in] vec    `CeedVector` to view
-  @param[in] start  Index of first `CeedVector` entry to view
-  @param[in] stop   Index of last `CeedVector` entry to view
+  @param[in] start  Index of first `CeedVector` entry to view in the range `[start, stop)`
+  @param[in] stop   One past the last element to view in the range, or `-1` for `length`
   @param[in] step   Step between `CeedVector` entries to view
   @param[in] fp_fmt Printing format
   @param[in] stream Filestream to write to
@@ -1017,7 +1017,7 @@ int CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt s
     fprintf(stream, "  start: %" CeedSize_FMT "\n  stop:  %" CeedSize_FMT "\n  step:  %" CeedInt_FMT "\n", start, stop, step);
   }
   if (start > length) start = length;
-  if (stop > length) stop = length;
+  if (stop == -1 || stop > length) stop = length;
 
   snprintf(fmt, sizeof fmt, "  %s\n", fp_fmt ? fp_fmt : "%g");
   CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &x));

From 832a6d734b42c29ce33664f3c9c828bd26de930d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 19 Mar 2025 09:07:13 -0600
Subject: [PATCH 352/571] vec - make CopyStrided reflect SetStrided

---
 backends/cuda-ref/ceed-cuda-ref-vector.c     | 17 ++++++------
 backends/cuda-ref/kernels/cuda-ref-vector.cu | 16 +++++------
 backends/hip-ref/ceed-hip-ref-vector.c       | 17 ++++++------
 include/ceed-impl.h                          |  2 +-
 include/ceed/ceed.h                          |  2 +-
 interface/ceed-vector.c                      | 28 ++++++++++++--------
 tests/t127-vector.c                          |  2 +-
 7 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 5db2c4cb6b..58999fc73a 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -223,20 +223,20 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_t
 //------------------------------------------------------------------------------
 // Copy host array to value strided
 //------------------------------------------------------------------------------
-static int CeedHostCopyStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *h_copy_array) {
-  for (CeedSize i = start; i < length; i += step) h_copy_array[i] = h_array[i];
+static int CeedHostCopyStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *h_copy_array) {
+  for (CeedSize i = start; i < stop; i += step) h_copy_array[i] = h_array[i];
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Copy device array to value strided (impl in .cu file)
 //------------------------------------------------------------------------------
-int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array);
+int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array);
 
 //------------------------------------------------------------------------------
 // Copy a vector to a value strided
 //------------------------------------------------------------------------------
-static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize step, CeedVector vec_copy) {
+static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) {
   CeedSize         length;
   CeedVector_Cuda *impl;
 
@@ -248,6 +248,7 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
     CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy));
     length = length_vec < length_copy ? length_vec : length_copy;
   }
+  if (stop == -1) stop = length;
   // Set value for synced device/host array
   if (impl->d_array) {
     CeedScalar *copy_array;
@@ -260,13 +261,13 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
     CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
     CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
 #if defined(CEED_SCALAR_IS_FP32)
-    CeedCallCublas(ceed, cublasScopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+    CeedCallCublas(ceed, cublasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
 #else  /* CEED_SCALAR */
-    CeedCallCublas(ceed, cublasDcopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+    CeedCallCublas(ceed, cublasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
 #endif /* CEED_SCALAR */
     CeedCallBackend(CeedDestroy(&ceed));
 #else  /* CUDA_VERSION */
-    CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, step, length, copy_array));
+    CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, stop, step, copy_array));
 #endif /* CUDA_VERSION */
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
     impl->h_array = NULL;
@@ -274,7 +275,7 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
     CeedScalar *copy_array;
 
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
-    CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, step, length, copy_array));
+    CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, stop, step, copy_array));
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
     impl->d_array = NULL;
   } else {
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 9560279fbe..6f83efaa1e 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -11,24 +11,24 @@
 //------------------------------------------------------------------------------
 // Kernel for copy strided on device
 //------------------------------------------------------------------------------
-__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
+__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *__restrict__ vec_copy) {
   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index < size) {
-    if ((index - start) % step == 0) vec_copy[index] = vec[index];
+  if (index < stop - start) {
+    if (index % step == 0) vec_copy[start + index] = vec[start + index];
   }
 }
 
 //------------------------------------------------------------------------------
 // Copy strided on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
+extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array) {
   const int      block_size = 512;
-  const CeedSize vec_size   = length;
-  int            grid_size  = vec_size / block_size;
+  const CeedSize copy_size  = stop - start;
+  int            grid_size  = copy_size / block_size;
 
-  if (block_size * grid_size < vec_size) grid_size += 1;
-  copyStridedK<<<grid_size, block_size>>>(d_array, start, step, length, d_copy_array);
+  if (block_size * grid_size < copy_size) grid_size += 1;
+  copyStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, d_copy_array);
   return 0;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index e4d464de80..50e7064551 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -223,20 +223,20 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_ty
 //------------------------------------------------------------------------------
 // Copy host array to value strided
 //------------------------------------------------------------------------------
-static int CeedHostCopyStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *h_copy_array) {
-  for (CeedSize i = start; i < length; i += step) h_copy_array[i] = h_array[i];
+static int CeedHostCopyStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *h_copy_array) {
+  for (CeedSize i = start; i < stop; i += step) h_copy_array[i] = h_array[i];
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Copy device array to value strided (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
-int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array);
+int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array);
 
 //------------------------------------------------------------------------------
 // Copy a vector to a value strided
 //------------------------------------------------------------------------------
-static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize step, CeedVector vec_copy) {
+static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) {
   CeedSize        length;
   CeedVector_Hip *impl;
 
@@ -248,6 +248,7 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
     CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy));
     length = length_vec < length_copy ? length_vec : length_copy;
   }
+  if (stop == -1) stop = length;
   // Set value for synced device/host array
   if (impl->d_array) {
     CeedScalar *copy_array;
@@ -260,12 +261,12 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
     CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
     CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
 #if defined(CEED_SCALAR_IS_FP32)
-    CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+    CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
 #else  /* CEED_SCALAR */
-    CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+    CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
 #endif /* CEED_SCALAR */
 #else  /* HIP_VERSION */
-    CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, step, length, copy_array));
+    CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, stop, step, copy_array));
 #endif /* HIP_VERSION */
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
     impl->h_array = NULL;
@@ -274,7 +275,7 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
     CeedScalar *copy_array;
 
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
-    CeedCallBackend(CeedHostCopyStrided_Hip(impl->h_array, start, step, length, copy_array));
+    CeedCallBackend(CeedHostCopyStrided_Hip(impl->h_array, start, stop, step, copy_array));
     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
     impl->d_array = NULL;
   } else {
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index b626e9d129..95c920604d 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -137,7 +137,7 @@ struct CeedVector_private {
   Ceed ceed;
   int (*HasValidArray)(CeedVector, bool *);
   int (*HasBorrowedArrayOfType)(CeedVector, CeedMemType, bool *);
-  int (*CopyStrided)(CeedVector, CeedSize, CeedSize, CeedVector);
+  int (*CopyStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedVector);
   int (*SetArray)(CeedVector, CeedMemType, CeedCopyMode, CeedScalar *);
   int (*SetValue)(CeedVector, CeedScalar);
   int (*SetValueStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedScalar);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 7a8941cec0..b1851d6a27 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -181,7 +181,7 @@ CEED_EXTERN int CeedGetPreferredMemType(Ceed ceed, CeedMemType *type);
 CEED_EXTERN int  CeedVectorCreate(Ceed ceed, CeedSize len, CeedVector *vec);
 CEED_EXTERN int  CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy);
 CEED_EXTERN int  CeedVectorCopy(CeedVector vec, CeedVector vec_copy);
-CEED_EXTERN int  CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVector vec_copy);
+CEED_EXTERN int  CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy);
 CEED_EXTERN int  CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array);
 CEED_EXTERN int  CeedVectorSetValue(CeedVector vec, CeedScalar value);
 CEED_EXTERN int  CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value);
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 09156f4d94..3d76acb95d 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -251,7 +251,8 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
   @brief Copy a strided portion of `CeedVector` contents into a different `CeedVector`
 
   @param[in]     vec      `CeedVector` to copy
-  @param[in]     start    First index to copy
+  @param[in]     start    First index to copy in the range `[start, stop)`
+  @param[in]     stop     One past the last element to copy in the range, or `-1` for `length`
   @param[in]     step     Stride between indices to copy
   @param[in,out] vec_copy `CeedVector` to copy values to
 
@@ -259,19 +260,12 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
 
   @ref User
 **/
-int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVector vec_copy) {
+int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) {
   CeedSize          length;
   const CeedScalar *array      = NULL;
   CeedScalar       *array_copy = NULL;
 
-  // Backend version
-  if (vec->CopyStrided && vec_copy->CopyStrided) {
-    CeedCall(vec->CopyStrided(vec, start, step, vec_copy));
-    vec_copy->state += 2;
-    return CEED_ERROR_SUCCESS;
-  }
-
-  // Get length
+  // Check length
   {
     CeedSize length_vec, length_copy;
 
@@ -280,11 +274,23 @@ int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedInt step, CeedVect
     if (length_vec <= 0 || length_copy <= 0) return CEED_ERROR_SUCCESS;
     length = length_vec < length_copy ? length_vec : length_copy;
   }
+  CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Invalid value for stop %" CeedSize_FMT ", must be in the range [-1, length]", stop);
+  CeedCheck(start >= 0 && start <= length && (start <= stop || stop == -1), CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Invalid value for start %" CeedSize_FMT ", must be in the range [0, stop]", start);
+
+  // Backend version
+  if (vec->CopyStrided && vec_copy->CopyStrided) {
+    CeedCall(vec->CopyStrided(vec, start, stop, step, vec_copy));
+    vec_copy->state += 2;
+    return CEED_ERROR_SUCCESS;
+  }
 
   // Copy
   CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &array));
   CeedCall(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &array_copy));
-  for (CeedSize i = start; i < length; i += step) array_copy[i] = array[i];
+  if (stop == -1) stop = length;
+  for (CeedSize i = start; i < stop; i += step) array_copy[i] = array[i];
 
   // Cleanup
   CeedCall(CeedVectorRestoreArrayRead(vec, &array));
diff --git a/tests/t127-vector.c b/tests/t127-vector.c
index 68e2470138..e13bf15d1b 100644
--- a/tests/t127-vector.c
+++ b/tests/t127-vector.c
@@ -36,7 +36,7 @@ int main(int argc, char **argv) {
 
   // Copy strided
   CeedVectorSetValue(y, 0.0);
-  CeedVectorCopyStrided(x, start, step, y);
+  CeedVectorCopyStrided(x, start, -1, step, y);
   {
     const CeedScalar *read_array;
 

From a637ca9c367b4a844e99f109efe7913184c913e7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 20 Mar 2025 09:48:12 -0600
Subject: [PATCH 353/571] minor - simpler upper bound

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 5 +++--
 backends/hip-ref/ceed-hip-ref-operator.c   | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 5212546ce1..808816986f 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1910,16 +1910,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
       // Update unit vector
       {
+        // Note: E-vec strides are node * (1) + comp * (elem_size * num_elem) + elem * (elem_size)
         CeedInt  node = (s - 1) % elem_size, comp = (s - 1) / elem_size;
         CeedSize start = node * 1 + comp * (elem_size * num_elem);
-        CeedSize stop  = start + (num_elem - 1) * elem_size + 1;
+        CeedSize stop  = (comp + 1) * (elem_size * num_elem);
 
         if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
         else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
 
         node = s % elem_size, comp = s / elem_size;
         start = node * 1 + comp * (elem_size * num_elem);
-        stop  = start + (num_elem - 1) * elem_size + 1;
+        stop  = (comp + 1) * (elem_size * num_elem);
         CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0));
       }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 86123a9e71..2d28627a72 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1907,16 +1907,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
       // Update unit vector
       {
+        // Note: E-vec strides are node * (1) + comp * (elem_size * num_elem) + elem * (elem_size)
         CeedInt  node = (s - 1) % elem_size, comp = (s - 1) / elem_size;
         CeedSize start = node * 1 + comp * (elem_size * num_elem);
-        CeedSize stop  = start + (num_elem - 1) * elem_size + 1;
+        CeedSize stop  = (comp + 1) * (elem_size * num_elem);
 
         if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
         else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
 
         node = s % elem_size, comp = s / elem_size;
         start = node * 1 + comp * (elem_size * num_elem);
-        stop  = start + (num_elem - 1) * elem_size + 1;
+        stop  = (comp + 1) * (elem_size * num_elem);
         CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0));
       }
 

From eb07d68f27a4df7d950ee845c62e88e28e25414a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 30 Aug 2024 13:53:12 -0600
Subject: [PATCH 354/571] rust - put less in prelude

---
 examples/rust/ex1-volume/src/main.rs          | 36 ++++++----
 examples/rust/ex1-volume/src/transform.rs     | 14 ++--
 examples/rust/ex2-surface/src/main.rs         | 36 ++++++----
 examples/rust/ex2-surface/src/transform.rs    | 10 ++-
 examples/rust/ex3-vector-volume/src/main.rs   | 29 ++++----
 .../rust/ex3-vector-volume/src/transform.rs   | 14 ++--
 examples/rust/ex4-vector-surface/src/main.rs  | 29 ++++----
 .../rust/ex4-vector-surface/src/transform.rs  | 10 ++-
 examples/rust/mesh/src/lib.rs                 | 18 ++---
 rust/libceed/src/basis.rs                     | 20 +++---
 rust/libceed/src/elem_restriction.rs          | 28 ++++----
 rust/libceed/src/lib.rs                       | 43 ++++++------
 rust/libceed/src/operator.rs                  | 66 ++++++++++---------
 rust/libceed/src/qfunction.rs                 | 48 ++++++--------
 rust/libceed/src/vector.rs                    | 48 +++++++-------
 15 files changed, 241 insertions(+), 208 deletions(-)

diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs
index 2f30d497e6..fe6eb3019f 100644
--- a/examples/rust/ex1-volume/src/main.rs
+++ b/examples/rust/ex1-volume/src/main.rs
@@ -19,7 +19,9 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
@@ -75,10 +77,20 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
-    let basis_solution =
-        ceed.basis_tensor_H1_Lagrange(dim, 1, solution_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        1,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
 
     // Determine mesh size from approximate problem size
     let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
@@ -157,9 +169,9 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_mass))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", 1, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Mass{}DBuild", dim);
@@ -204,9 +216,9 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_mass_closure = ceed
         .q_function_interior(1, Box::new(apply_mass))?
-        .input("u", 1, EvalMode::Interp)?
-        .input("qdata", 1, EvalMode::None)?
-        .output("v", 1, EvalMode::Interp)?;
+        .input("u", 1, libceed::EvalMode::Interp)?
+        .input("qdata", 1, libceed::EvalMode::None)?
+        .output("v", 1, libceed::EvalMode::Interp)?;
     // -- QFunction from gallery
     let qf_mass_named = ceed.q_function_interior_by_name("MassApply")?;
     // -- QFunction for use with Operator
@@ -233,7 +245,7 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     op_mass.apply(&u, &mut v)?;
 
     // Compute the mesh volume
-    let volume: Scalar = v.view()?.iter().sum();
+    let volume: libceed::Scalar = v.view()?.iter().sum();
 
     // Output results
     if !quiet {
diff --git a/examples/rust/ex1-volume/src/transform.rs b/examples/rust/ex1-volume/src/transform.rs
index a66087a330..875194b829 100644
--- a/examples/rust/ex1-volume/src/transform.rs
+++ b/examples/rust/ex1-volume/src/transform.rs
@@ -5,23 +5,21 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
     mesh_size: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     if dim == 1 {
         for coord in mesh_coords.view_mut()?.iter_mut() {
             // map [0,1] to [0,1] varying the mesh density
             *coord = 0.5
-                + 1.0 / (3.0 as Scalar).sqrt()
-                    * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+                + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                    * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
         }
     } else {
         let mut coords = mesh_coords.view_mut()?;
@@ -30,7 +28,7 @@ pub(crate) fn transform_mesh_coordinates(
             // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
             // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
             let u = 1.0 + coords[i];
-            let v = std::f64::consts::PI as Scalar / 2.0 * coords[i + num_nodes];
+            let v = std::f64::consts::PI as libceed::Scalar / 2.0 * coords[i + num_nodes];
             coords[i] = u * v.cos();
             coords[i + num_nodes] = u * v.sin();
         }
@@ -39,7 +37,7 @@ pub(crate) fn transform_mesh_coordinates(
     // Exact volume of transformed region
     let exact_volume = match dim {
         1 => 1.0,
-        _ => 3.0 / 4.0 * std::f64::consts::PI as Scalar,
+        _ => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
     };
     Ok(exact_volume)
 }
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index a7d20dedbf..cd5e119a17 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -20,7 +20,9 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
@@ -80,10 +82,20 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
-    let basis_solution =
-        ceed.basis_tensor_H1_Lagrange(dim, 1, solution_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        1,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
 
     // Determine mesh size from approximate problem size
     let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
@@ -199,9 +211,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_diff))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", dim * (dim + 1) / 2, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Poisson{}DBuild", dim);
@@ -280,9 +292,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_diff_closure = ceed
         .q_function_interior(1, Box::new(apply_diff))?
-        .input("du", dim, EvalMode::Grad)?
-        .input("qdata", dim * (dim + 1) / 2, EvalMode::None)?
-        .output("dv", dim, EvalMode::Grad)?;
+        .input("du", dim, libceed::EvalMode::Grad)?
+        .input("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("dv", dim, libceed::EvalMode::Grad)?;
     // -- QFunction from gallery
     let qf_diff_named = {
         let name = format!("Poisson{}DApply", dim);
@@ -319,7 +331,7 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     op_diff.apply(&u, &mut v)?;
 
     // Compute the mesh surface area
-    let area: Scalar = v.view()?.iter().map(|v| (*v).abs()).sum();
+    let area: libceed::Scalar = v.view()?.iter().map(|v| (*v).abs()).sum();
 
     // Output results
     if !quiet {
diff --git a/examples/rust/ex2-surface/src/transform.rs b/examples/rust/ex2-surface/src/transform.rs
index 666c56c886..d8c21af927 100644
--- a/examples/rust/ex2-surface/src/transform.rs
+++ b/examples/rust/ex2-surface/src/transform.rs
@@ -5,21 +5,19 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     for coord in mesh_coords.view_mut()?.iter_mut() {
         // map [0,1] to [0,1] varying the mesh density
         *coord = 0.5
-            + 1.0 / (3.0 as Scalar).sqrt()
-                * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+            + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
     }
 
     // Exact surface area of transformed region
diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex3-vector-volume/src/main.rs
index f16307f066..8a50b2f09f 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex3-vector-volume/src/main.rs
@@ -20,7 +20,9 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
@@ -77,14 +79,19 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
     let basis_solution = ceed.basis_tensor_H1_Lagrange(
         dim,
         ncomp_u,
         solution_degree + 1,
         num_qpts,
-        QuadMode::Gauss,
+        libceed::QuadMode::Gauss,
     )?;
 
     // Determine mesh size from approximate problem size
@@ -166,9 +173,9 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_mass))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", 1, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Mass{}DBuild", dim);
@@ -217,9 +224,9 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_mass_closure = ceed
         .q_function_interior(1, Box::new(apply_mass))?
-        .input("u", ncomp_u, EvalMode::Interp)?
-        .input("qdata", 1, EvalMode::None)?
-        .output("v", ncomp_u, EvalMode::Interp)?;
+        .input("u", ncomp_u, libceed::EvalMode::Interp)?
+        .input("qdata", 1, libceed::EvalMode::None)?
+        .output("v", ncomp_u, libceed::EvalMode::Interp)?;
     // -- QFunction from gallery
     let qf_mass_named = ceed.q_function_interior_by_name("Vector3MassApply")?;
     // -- QFunction for use with Operator
@@ -255,7 +262,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     op_mass.apply(&u, &mut v)?;
 
     // Compute the mesh volume
-    let volume: Scalar = v.view()?.iter().sum::<libceed::Scalar>()
+    let volume: libceed::Scalar = v.view()?.iter().sum::<libceed::Scalar>()
         / ((ncomp_u * (ncomp_u + 1)) / 2) as libceed::Scalar;
 
     // Output results
diff --git a/examples/rust/ex3-vector-volume/src/transform.rs b/examples/rust/ex3-vector-volume/src/transform.rs
index a66087a330..875194b829 100644
--- a/examples/rust/ex3-vector-volume/src/transform.rs
+++ b/examples/rust/ex3-vector-volume/src/transform.rs
@@ -5,23 +5,21 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
     mesh_size: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     if dim == 1 {
         for coord in mesh_coords.view_mut()?.iter_mut() {
             // map [0,1] to [0,1] varying the mesh density
             *coord = 0.5
-                + 1.0 / (3.0 as Scalar).sqrt()
-                    * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+                + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                    * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
         }
     } else {
         let mut coords = mesh_coords.view_mut()?;
@@ -30,7 +28,7 @@ pub(crate) fn transform_mesh_coordinates(
             // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
             // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
             let u = 1.0 + coords[i];
-            let v = std::f64::consts::PI as Scalar / 2.0 * coords[i + num_nodes];
+            let v = std::f64::consts::PI as libceed::Scalar / 2.0 * coords[i + num_nodes];
             coords[i] = u * v.cos();
             coords[i + num_nodes] = u * v.sin();
         }
@@ -39,7 +37,7 @@ pub(crate) fn transform_mesh_coordinates(
     // Exact volume of transformed region
     let exact_volume = match dim {
         1 => 1.0,
-        _ => 3.0 / 4.0 * std::f64::consts::PI as Scalar,
+        _ => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
     };
     Ok(exact_volume)
 }
diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex4-vector-surface/src/main.rs
index d04fdb6746..3ac489e8a2 100644
--- a/examples/rust/ex4-vector-surface/src/main.rs
+++ b/examples/rust/ex4-vector-surface/src/main.rs
@@ -21,7 +21,9 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
@@ -82,14 +84,19 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
     let basis_solution = ceed.basis_tensor_H1_Lagrange(
         dim,
         ncomp_u,
         solution_degree + 1,
         num_qpts,
-        QuadMode::Gauss,
+        libceed::QuadMode::Gauss,
     )?;
 
     // Determine mesh size from approximate problem size
@@ -206,9 +213,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_diff))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", dim * (dim + 1) / 2, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Poisson{}DBuild", dim);
@@ -301,9 +308,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_diff_closure = ceed
         .q_function_interior(1, Box::new(apply_diff))?
-        .input("du", dim * ncomp_u, EvalMode::Grad)?
-        .input("qdata", dim * (dim + 1) / 2, EvalMode::None)?
-        .output("dv", dim * ncomp_u, EvalMode::Grad)?;
+        .input("du", dim * ncomp_u, libceed::EvalMode::Grad)?
+        .input("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("dv", dim * ncomp_u, libceed::EvalMode::Grad)?;
     // -- QFunction from gallery
     let qf_diff_named = {
         let name = format!("Vector3Poisson{}DApply", dim);
@@ -349,7 +356,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     op_diff.apply(&u, &mut v)?;
 
     // Compute the mesh surface area
-    let area: Scalar = v
+    let area: libceed::Scalar = v
         .view()?
         .iter()
         .map(|v| (*v).abs())
diff --git a/examples/rust/ex4-vector-surface/src/transform.rs b/examples/rust/ex4-vector-surface/src/transform.rs
index 666c56c886..d8c21af927 100644
--- a/examples/rust/ex4-vector-surface/src/transform.rs
+++ b/examples/rust/ex4-vector-surface/src/transform.rs
@@ -5,21 +5,19 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     for coord in mesh_coords.view_mut()?.iter_mut() {
         // map [0,1] to [0,1] varying the mesh density
         *coord = 0.5
-            + 1.0 / (3.0 as Scalar).sqrt()
-                * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+            + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
     }
 
     // Exact surface area of transformed region
diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs
index 0b7a9da4e5..f249e6dfab 100644
--- a/examples/rust/mesh/src/lib.rs
+++ b/examples/rust/mesh/src/lib.rs
@@ -5,7 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::{prelude::*, Ceed};
+use libceed::{Ceed, ElemRestriction, Vector};
 
 // ----------------------------------------------------------------------------
 // Determine problem size in each dimension from size and dimenison
@@ -91,7 +91,7 @@ pub fn build_cartesian_restriction(
         num_comp,
         scalar_size,
         num_comp * scalar_size,
-        MemType::Host,
+        libceed::MemType::Host,
         &elem_nodes,
     )?;
 
@@ -101,7 +101,7 @@ pub fn build_cartesian_restriction(
         elem_qpts,
         num_comp,
         num_comp * elem_qpts * num_elem,
-        CEED_STRIDES_BACKEND,
+        libceed::CEED_STRIDES_BACKEND,
     )?;
     Ok((rstr, rstr_qdata))
 }
@@ -125,13 +125,14 @@ pub fn cartesian_mesh_coords(
     }
 
     // Lobatto points
-    let lobatto_basis = ceed.basis_tensor_H1_Lagrange(1, 1, 2, p, QuadMode::GaussLobatto)?;
+    let lobatto_basis =
+        ceed.basis_tensor_H1_Lagrange(1, 1, 2, p, libceed::QuadMode::GaussLobatto)?;
     let nodes_corners = ceed.vector_from_slice(&[0.0, 1.0])?;
     let mut nodes_full = ceed.vector(p)?;
     lobatto_basis.apply(
         1,
-        TransposeMode::NoTranspose,
-        EvalMode::Interp,
+        libceed::TransposeMode::NoTranspose,
+        libceed::EvalMode::Interp,
         &nodes_corners,
         &mut nodes_full,
     )?;
@@ -146,8 +147,9 @@ pub fn cartesian_mesh_coords(
             let mut r_nodes = gs_nodes;
             for d in 0..dim {
                 let d_1d = r_nodes % num_d[d];
-                coords[gs_nodes + scalar_size * d] =
-                    ((d_1d / (p - 1)) as Scalar + nodes[d_1d % (p - 1)]) / num_xyz[d] as Scalar;
+                coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) as libceed::Scalar
+                    + nodes[d_1d % (p - 1)])
+                    / num_xyz[d] as libceed::Scalar;
                 r_nodes /= num_d[d];
             }
         }
diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index d613745fc5..d16f8420f6 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -8,7 +8,7 @@
 //! A Ceed Basis defines the discrete finite element basis and associated
 //! quadrature rule.
 
-use crate::prelude::*;
+use crate::{prelude::*, vector::Vector, EvalMode, TransposeMode};
 
 // -----------------------------------------------------------------------------
 // Basis option
@@ -37,7 +37,7 @@ impl<'a> BasisOpt<'a> {
     /// Check if a BasisOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?;
@@ -59,7 +59,7 @@ impl<'a> BasisOpt<'a> {
     /// Check if a BasisOpt is None
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?;
@@ -108,7 +108,7 @@ impl<'a> fmt::Display for Basis<'a> {
     /// View a Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?;
@@ -343,7 +343,7 @@ impl<'a> Basis<'a> {
     /// * `v`     - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, TransposeMode, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// const Q: usize = 6;
@@ -416,7 +416,7 @@ impl<'a> Basis<'a> {
     /// Returns the dimension for given Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let dim = 2;
@@ -436,7 +436,7 @@ impl<'a> Basis<'a> {
     /// Returns number of components for given Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ncomp = 2;
@@ -456,7 +456,7 @@ impl<'a> Basis<'a> {
     /// Returns total number of nodes (in dim dimensions) of a Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let p = 3;
@@ -477,7 +477,7 @@ impl<'a> Basis<'a> {
     /// Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let q = 4;
@@ -506,7 +506,7 @@ impl<'a> Basis<'a> {
     /// points and weights.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, TransposeMode, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let coarse = ceed.basis_tensor_H1_Lagrange(1, 1, 2, 3, QuadMode::Gauss)?;
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index 3a340e7b23..60692b49e5 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -8,7 +8,7 @@
 //! A Ceed ElemRestriction decomposes elements and groups the degrees of freedom
 //! (dofs) according to the different elements they belong to.
 
-use crate::prelude::*;
+use crate::{prelude::*, vector::Vector, TransposeMode};
 
 // -----------------------------------------------------------------------------
 // ElemRestriction option
@@ -38,7 +38,7 @@ impl<'a> ElemRestrictionOpt<'a> {
     /// Check if an ElemRestrictionOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemRestrictionOpt, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -66,7 +66,7 @@ impl<'a> ElemRestrictionOpt<'a> {
     /// Check if an ElemRestrictionOpt is None
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemRestrictionOpt, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -121,7 +121,7 @@ impl<'a> fmt::Display for ElemRestriction<'a> {
     /// View an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -328,7 +328,7 @@ impl<'a> ElemRestriction<'a> {
     /// Create an Lvector for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -357,7 +357,7 @@ impl<'a> ElemRestriction<'a> {
     /// Create an Evector for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -386,7 +386,7 @@ impl<'a> ElemRestriction<'a> {
     /// Create Vectors for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -426,7 +426,7 @@ impl<'a> ElemRestriction<'a> {
     ///               decided by the backend.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, Scalar, TransposeMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -469,7 +469,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the Lvector component stride
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -495,7 +495,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the total number of elements in the range of a ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -520,7 +520,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the size of elements in the ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -546,7 +546,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the size of the Lvector for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -571,7 +571,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the number of components in the elements of an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -597,7 +597,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the multiplicity of nodes in an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index 9c68db4dc6..7ead6ea964 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -19,18 +19,6 @@ use crate::prelude::*;
 use std::sync::Once;
 
 pub mod prelude {
-    pub use crate::{
-        basis::{self, Basis, BasisOpt},
-        elem_restriction::{self, ElemRestriction, ElemRestrictionOpt},
-        operator::{self, CompositeOperator, Operator, OperatorField},
-        qfunction::{
-            self, QFunction, QFunctionByName, QFunctionField, QFunctionInputs, QFunctionOpt,
-            QFunctionOutputs,
-        },
-        vector::{self, Vector, VectorOpt, VectorSliceWrapper},
-        ElemTopology, EvalMode, MemType, NormType, QuadMode, Scalar, TransposeMode,
-        CEED_STRIDES_BACKEND, EPSILON, MAX_QFUNCTION_FIELDS,
-    };
     pub(crate) use libceed_sys::bind_ceed;
     pub(crate) use std::convert::TryFrom;
     pub(crate) use std::ffi::{CStr, CString};
@@ -157,6 +145,19 @@ impl fmt::Display for Error {
     }
 }
 
+// -----------------------------------------------------------------------------
+// Internal crate contents
+// -----------------------------------------------------------------------------
+pub use crate::{
+    basis::{Basis, BasisOpt},
+    elem_restriction::{ElemRestriction, ElemRestrictionOpt},
+    operator::{CompositeOperator, Operator, OperatorField},
+    qfunction::{
+        QFunction, QFunctionByName, QFunctionField, QFunctionInputs, QFunctionOpt, QFunctionOutputs,
+    },
+    vector::{Vector, VectorOpt, VectorSliceWrapper},
+};
+
 // -----------------------------------------------------------------------------
 // Internal error checker
 // -----------------------------------------------------------------------------
@@ -427,7 +428,7 @@ impl Ceed {
     ///                    `[0, lsize - 1]`.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -486,7 +487,7 @@ impl Ceed {
     ///                    orientation.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -550,7 +551,7 @@ impl Ceed {
     ///                     unknowns upon restriction.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -719,7 +720,7 @@ impl Ceed {
     ///               accuracy for the quadrature)
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(2, 1, 3, 4, QuadMode::Gauss)?;
@@ -755,7 +756,7 @@ impl Ceed {
     ///                 the reference element
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemTopology};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let interp = [
@@ -886,7 +887,7 @@ impl Ceed {
     ///                 the reference element
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemTopology};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let interp = [
@@ -983,7 +984,7 @@ impl Ceed {
     ///                 the reference element
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemTopology};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let interp = [
@@ -1077,7 +1078,7 @@ impl Ceed {
     /// * `f`       - Boxed closure to evaluate weak form at quadrature points.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -1135,7 +1136,7 @@ impl Ceed {
     ///              Jacobian of the qf (or qfunction_none)
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QFunctionOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 7fd2891a66..ec92f2f3e1 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -9,7 +9,13 @@
 //! Ceed QFunction. A Ceed Operator connects Ceed ElemRestrictions,
 //! Ceed Bases, and Ceed QFunctions.
 
-use crate::prelude::*;
+use crate::{
+    basis::{Basis, BasisOpt},
+    elem_restriction::{ElemRestriction, ElemRestrictionOpt},
+    prelude::*,
+    qfunction::QFunctionOpt,
+    vector::{Vector, VectorOpt},
+};
 
 // -----------------------------------------------------------------------------
 // Operator Field context wrapper
@@ -61,7 +67,7 @@ impl<'a> OperatorField<'a> {
     /// Get the name of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -108,7 +114,7 @@ impl<'a> OperatorField<'a> {
     /// Get the ElemRestriction of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -180,7 +186,7 @@ impl<'a> OperatorField<'a> {
     /// Get the Basis of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -242,7 +248,7 @@ impl<'a> OperatorField<'a> {
     /// Get the Vector of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -340,7 +346,7 @@ impl<'a> fmt::Display for OperatorCore<'a> {
 /// View an Operator
 ///
 /// ```
-/// # use libceed::prelude::*;
+/// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
 /// # fn main() -> libceed::Result<()> {
 /// # let ceed = libceed::Ceed::default_init();
 /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -380,7 +386,7 @@ impl<'a> fmt::Display for Operator<'a> {
 /// View a composite Operator
 ///
 /// ```
-/// # use libceed::prelude::*;
+/// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
 /// # fn main() -> libceed::Result<()> {
 /// # let ceed = libceed::Ceed::default_init();
 ///
@@ -571,7 +577,7 @@ impl<'a> Operator<'a> {
     /// * 'name' - Name to set
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -612,7 +618,7 @@ impl<'a> Operator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -690,7 +696,7 @@ impl<'a> Operator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -770,7 +776,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -818,7 +824,7 @@ impl<'a> Operator<'a> {
     /// Get a slice of Operator inputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -888,7 +894,7 @@ impl<'a> Operator<'a> {
     /// Get a slice of Operator outputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -958,7 +964,7 @@ impl<'a> Operator<'a> {
     /// Check if Operator is setup correctly
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1001,7 +1007,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -1038,7 +1044,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -1085,7 +1091,7 @@ impl<'a> Operator<'a> {
     /// * `assembled` - Vector to store assembled Operator diagonal
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1192,7 +1198,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1304,7 +1310,7 @@ impl<'a> Operator<'a> {
     ///                   `[nodes, component out, component in]`.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionInputs, QFunctionOpt, QFunctionOutputs, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1443,7 +1449,7 @@ impl<'a> Operator<'a> {
     ///                   `[nodes, component out, component in]`.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionInputs, QFunctionOpt, QFunctionOutputs, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1575,7 +1581,7 @@ impl<'a> Operator<'a> {
     /// * `basis_coarse` - Coarse grid active vector basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 15;
@@ -1734,7 +1740,7 @@ impl<'a> Operator<'a> {
     /// * `interp_c_to_f` - Matrix for coarse to fine
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionOpt, QuadMode, Scalar, TransposeMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 15;
@@ -1892,7 +1898,7 @@ impl<'a> Operator<'a> {
         p_mult_fine: &Vector,
         rstr_coarse: &ElemRestriction,
         basis_coarse: &Basis,
-        interpCtoF: &Vec<Scalar>,
+        interpCtoF: &Vec<crate::Scalar>,
     ) -> crate::Result<(Operator<'b>, Operator<'b>, Operator<'b>)> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
@@ -1924,7 +1930,7 @@ impl<'a> Operator<'a> {
     /// * `interp_c_to_f` - Matrix for coarse to fine
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionOpt, QuadMode, Scalar, TransposeMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 15;
@@ -2082,7 +2088,7 @@ impl<'a> Operator<'a> {
         p_mult_fine: &Vector,
         rstr_coarse: &ElemRestriction,
         basis_coarse: &Basis,
-        interpCtoF: &[Scalar],
+        interpCtoF: &[crate::Scalar],
     ) -> crate::Result<(Operator<'b>, Operator<'b>, Operator<'b>)> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
@@ -2127,7 +2133,7 @@ impl<'a> CompositeOperator<'a> {
     /// * 'name' - Name to set
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     ///
@@ -2184,7 +2190,7 @@ impl<'a> CompositeOperator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -2281,7 +2287,7 @@ impl<'a> CompositeOperator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -2377,7 +2383,7 @@ impl<'a> CompositeOperator<'a> {
     /// * `subop` - Sub-Operator
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QFunctionOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut op = ceed.composite_operator()?;
@@ -2403,7 +2409,7 @@ impl<'a> CompositeOperator<'a> {
     /// Check if CompositeOperator is setup correctly
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index cf1dbbdcf3..c127cbe98b 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -10,7 +10,7 @@
 
 use std::pin::Pin;
 
-use crate::prelude::*;
+use crate::{prelude::*, vector::Vector, MAX_QFUNCTION_FIELDS};
 
 pub type QFunctionInputs<'a> = [&'a [crate::Scalar]; MAX_QFUNCTION_FIELDS];
 pub type QFunctionOutputs<'a> = [&'a mut [crate::Scalar]; MAX_QFUNCTION_FIELDS];
@@ -82,7 +82,7 @@ impl<'a> QFunctionField<'a> {
     /// Get the evaluation mode of a QFunctionField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// const Q: usize = 8;
@@ -150,7 +150,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -191,7 +191,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is SomeQFunction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -232,7 +232,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is SomeQFunctionByName
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -282,7 +282,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is None
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -389,7 +389,7 @@ impl<'a> fmt::Display for QFunctionCore<'a> {
 /// View a QFunction
 ///
 /// ```
-/// # use libceed::prelude::*;
+/// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
 /// # fn main() -> libceed::Result<()> {
 /// # let ceed = libceed::Ceed::default_init();
 /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -467,7 +467,7 @@ impl<'a> QFunctionCore<'a> {
         })
     }
 
-    pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField]> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
@@ -482,15 +482,12 @@ impl<'a> QFunctionCore<'a> {
         })?;
         // Convert raw C pointers to fixed length slice
         let inputs_slice = unsafe {
-            std::slice::from_raw_parts(
-                inputs_ptr as *const crate::QFunctionField,
-                num_inputs as usize,
-            )
+            std::slice::from_raw_parts(inputs_ptr as *const QFunctionField, num_inputs as usize)
         };
         Ok(inputs_slice)
     }
 
-    pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField]> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
@@ -505,10 +502,7 @@ impl<'a> QFunctionCore<'a> {
         })?;
         // Convert raw C pointers to fixed length slice
         let outputs_slice = unsafe {
-            std::slice::from_raw_parts(
-                outputs_ptr as *const crate::QFunctionField,
-                num_outputs as usize,
-            )
+            std::slice::from_raw_parts(outputs_ptr as *const QFunctionField, num_outputs as usize)
         };
         Ok(outputs_slice)
     }
@@ -658,7 +652,7 @@ impl<'a> QFunction<'a> {
     /// * `output` - Array of output Vectors
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -726,7 +720,7 @@ impl<'a> QFunction<'a> {
     ///                   gradients, `EvalMode::Weight` to use quadrature weights
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -776,7 +770,7 @@ impl<'a> QFunction<'a> {
     ///                   gradients
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -818,7 +812,7 @@ impl<'a> QFunction<'a> {
     /// Get a slice of QFunction inputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -842,14 +836,14 @@ impl<'a> QFunction<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField]> {
         self.qf_core.inputs()
     }
 
     /// Get a slice of QFunction outputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -872,7 +866,7 @@ impl<'a> QFunction<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField]> {
         self.qf_core.outputs()
     }
 }
@@ -903,7 +897,7 @@ impl<'a> QFunctionByName<'a> {
     /// * `output` - Array of output Vectors
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// const Q: usize = 8;
@@ -976,7 +970,7 @@ impl<'a> QFunctionByName<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField]> {
         self.qf_core.inputs()
     }
 
@@ -995,7 +989,7 @@ impl<'a> QFunctionByName<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField]> {
         self.qf_core.outputs()
     }
 }
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index 3793e437b0..dcf86d9e16 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -45,7 +45,7 @@ impl<'a> VectorOpt<'a> {
     /// Check if a VectorOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
@@ -71,7 +71,7 @@ impl<'a> VectorOpt<'a> {
     /// Check if a VectorOpt is Active
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
@@ -97,7 +97,7 @@ impl<'a> VectorOpt<'a> {
     /// Check if a VectorOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
@@ -125,7 +125,7 @@ impl<'a> VectorOpt<'a> {
 // Vector borrowed slice wrapper
 // -----------------------------------------------------------------------------
 pub struct VectorSliceWrapper<'a> {
-    pub(crate) vector: crate::Vector<'a>,
+    pub(crate) vector: Vector<'a>,
     pub(crate) _slice: &'a mut [crate::Scalar],
 }
 
@@ -149,7 +149,7 @@ impl<'a> Drop for VectorSliceWrapper<'a> {
 // -----------------------------------------------------------------------------
 impl<'a> VectorSliceWrapper<'a> {
     fn from_vector_and_slice_mut<'b>(
-        vec: &'b mut crate::Vector,
+        vec: &'b mut Vector,
         slice: &'a mut [crate::Scalar],
     ) -> crate::Result<Self> {
         assert_eq!(vec.length(), slice.len());
@@ -166,7 +166,7 @@ impl<'a> VectorSliceWrapper<'a> {
             )
         })?;
         Ok(Self {
-            vector: unsafe { crate::Vector::from_raw(vec.ptr_copy_mut()?)? },
+            vector: unsafe { Vector::from_raw(vec.ptr_copy_mut()?)? },
             _slice: slice,
         })
     }
@@ -272,7 +272,7 @@ impl<'a> Vector<'a> {
     /// * `vec_source` - vector to copy array values from
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let a = ceed.vector_from_slice(&[1., 2., 3.])?;
@@ -286,7 +286,7 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     /// ```
-    pub fn copy_from(&mut self, vec_source: &crate::Vector) -> crate::Result<i32> {
+    pub fn copy_from(&mut self, vec_source: &Vector) -> crate::Result<i32> {
         self.check_error(unsafe { bind_ceed::CeedVectorCopy(vec_source.ptr, self.ptr) })
     }
 
@@ -300,7 +300,7 @@ impl<'a> Vector<'a> {
     /// # use libceed::prelude::*;
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
-    /// let vec = vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
+    /// let vec = libceed::Vector::from_slice(&ceed, &[1., 2., 3.])?;
     /// assert_eq!(vec.length(), 3, "Incorrect length from slice");
     /// # Ok(())
     /// # }
@@ -418,7 +418,7 @@ impl<'a> Vector<'a> {
     /// * `slice` - values to into self; length must match
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut vec = ceed.vector(4)?;
@@ -453,7 +453,7 @@ impl<'a> Vector<'a> {
     /// * `slice` - values to wrap in self; length must match
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut vec = ceed.vector(4)?;
@@ -495,7 +495,7 @@ impl<'a> Vector<'a> {
         &mut self,
         slice: &'b mut [crate::Scalar],
     ) -> crate::Result<VectorSliceWrapper<'b>> {
-        crate::VectorSliceWrapper::from_vector_and_slice_mut(self, slice)
+        VectorSliceWrapper::from_vector_and_slice_mut(self, slice)
     }
 
     /// Sync the Vector to a specified memtype
@@ -505,7 +505,7 @@ impl<'a> Vector<'a> {
     /// * `mtype` - Memtype to be synced
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let len = 10;
@@ -577,7 +577,7 @@ impl<'a> Vector<'a> {
     /// * `ntype` - Norm type One, Two, or Max
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, NormType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = ceed.vector_from_slice(&[1., 2., 3., 4.])?;
@@ -608,7 +608,7 @@ impl<'a> Vector<'a> {
     /// * `alpha` - scaling factor
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut vec = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -634,7 +634,7 @@ impl<'a> Vector<'a> {
     /// * `x`     - second vector, must be different than self
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let x = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -648,7 +648,7 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     #[allow(unused_mut)]
-    pub fn axpy(mut self, alpha: crate::Scalar, x: &crate::Vector) -> crate::Result<Self> {
+    pub fn axpy(mut self, alpha: crate::Scalar, x: &Vector) -> crate::Result<Self> {
         self.check_error(unsafe { bind_ceed::CeedVectorAXPY(self.ptr, alpha, x.ptr) })?;
         Ok(self)
     }
@@ -662,7 +662,7 @@ impl<'a> Vector<'a> {
     /// * `x`     - second vector, must be different than self
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let x = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -680,7 +680,7 @@ impl<'a> Vector<'a> {
         mut self,
         alpha: crate::Scalar,
         beta: crate::Scalar,
-        x: &crate::Vector,
+        x: &Vector,
     ) -> crate::Result<Self> {
         self.check_error(unsafe { bind_ceed::CeedVectorAXPBY(self.ptr, alpha, beta, x.ptr) })?;
         Ok(self)
@@ -694,7 +694,7 @@ impl<'a> Vector<'a> {
     /// * `y` - second vector for product
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -709,7 +709,7 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     #[allow(unused_mut)]
-    pub fn pointwise_mult(mut self, x: &crate::Vector, y: &crate::Vector) -> crate::Result<Self> {
+    pub fn pointwise_mult(mut self, x: &Vector, y: &Vector) -> crate::Result<Self> {
         self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, x.ptr, y.ptr) })?;
         Ok(self)
     }
@@ -721,7 +721,7 @@ impl<'a> Vector<'a> {
     /// * `x` - second vector for product
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -735,7 +735,7 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     #[allow(unused_mut)]
-    pub fn pointwise_scale(mut self, x: &crate::Vector) -> crate::Result<Self> {
+    pub fn pointwise_scale(mut self, x: &Vector) -> crate::Result<Self> {
         self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, x.ptr) })?;
         Ok(self)
     }
@@ -743,7 +743,7 @@ impl<'a> Vector<'a> {
     /// Compute the pointwise multiplication w = w .* w for a Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;

From bf55b007047d3d1ea1227af6b83a1b17e380b530 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 30 Aug 2024 14:10:41 -0600
Subject: [PATCH 355/571] rust - satiate clippy

---
 examples/rust/ex1-volume/src/main.rs         | 17 +++++++------
 examples/rust/ex2-surface/src/main.rs        | 15 ++++++-----
 examples/rust/ex3-vector-volume/src/main.rs  | 17 +++++++------
 examples/rust/ex4-vector-surface/src/main.rs | 15 ++++++-----
 rust/libceed/src/lib.rs                      |  2 +-
 rust/libceed/src/operator.rs                 | 26 ++++++++++----------
 rust/libceed/src/qfunction.rs                |  6 ++---
 rust/libceed/src/vector.rs                   | 10 ++------
 8 files changed, 53 insertions(+), 55 deletions(-)

diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs
index fe6eb3019f..88f87d656e 100644
--- a/examples/rust/ex1-volume/src/main.rs
+++ b/examples/rust/ex1-volume/src/main.rs
@@ -46,17 +46,20 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((1..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test { 8 * 16 } else { 256 * 1024 };
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
 
     // Summary output
     if !quiet {
@@ -102,7 +105,7 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index cd5e119a17..e0cec46e7c 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -47,21 +47,20 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((0..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test {
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
             16 * 16 * (dim * dim) as i64
         } else {
             256 * 1024
-        };
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
 
     // Summary output
     if !quiet {
@@ -107,7 +106,7 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex3-vector-volume/src/main.rs
index 8a50b2f09f..7b9a86a465 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex3-vector-volume/src/main.rs
@@ -47,17 +47,20 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((0..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test { 8 * 16 } else { 256 * 1024 };
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
     let ncomp_u = 3;
 
     // Summary output
@@ -104,7 +107,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex4-vector-surface/src/main.rs
index 3ac489e8a2..fc15c1b034 100644
--- a/examples/rust/ex4-vector-surface/src/main.rs
+++ b/examples/rust/ex4-vector-surface/src/main.rs
@@ -48,21 +48,20 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((0..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test {
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
             16 * 16 * (dim * dim) as i64
         } else {
             256 * 1024
-        };
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
     let ncomp_u = 3;
 
     // Summary output
@@ -109,7 +108,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index 7ead6ea964..04d34ad25d 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -309,7 +309,7 @@ impl Ceed {
 
         // Call to libCEED
         let mut ptr = std::ptr::null_mut();
-        let mut ierr = unsafe { bind_ceed::CeedInit(c_resource.as_ptr() as *const i8, &mut ptr) };
+        let mut ierr = unsafe { bind_ceed::CeedInit(c_resource.as_ptr(), &mut ptr) };
         if ierr != 0 {
             panic!("Error initializing backend resource: {}", resource)
         }
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index ec92f2f3e1..e4887f63f9 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -74,7 +74,7 @@ impl<'a> OperatorField<'a> {
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -121,7 +121,7 @@ impl<'a> OperatorField<'a> {
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -193,7 +193,7 @@ impl<'a> OperatorField<'a> {
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -255,7 +255,7 @@ impl<'a> OperatorField<'a> {
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -353,7 +353,7 @@ impl<'a> fmt::Display for OperatorCore<'a> {
 ///
 /// // Operator field arguments
 /// let ne = 3;
-/// let q = 4 as usize;
+/// let q = 4_usize;
 /// let mut ind: Vec<i32> = vec![0; 2 * ne];
 /// for i in 0..ne {
 ///     ind[2 * i + 0] = i as i32;
@@ -392,7 +392,7 @@ impl<'a> fmt::Display for Operator<'a> {
 ///
 /// // Sub operator field arguments
 /// let ne = 3;
-/// let q = 4 as usize;
+/// let q = 4_usize;
 /// let mut ind: Vec<i32> = vec![0; 2 * ne];
 /// for i in 0..ne {
 ///     ind[2 * i + 0] = i as i32;
@@ -584,7 +584,7 @@ impl<'a> Operator<'a> {
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -808,7 +808,7 @@ impl<'a> Operator<'a> {
         v: impl Into<VectorOpt<'b>>,
     ) -> crate::Result<Self> {
         let fieldname = CString::new(fieldname).expect("CString::new failed");
-        let fieldname = fieldname.as_ptr() as *const i8;
+        let fieldname = fieldname.as_ptr();
         self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorSetField(
                 self.op_core.ptr,
@@ -831,7 +831,7 @@ impl<'a> Operator<'a> {
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -866,7 +866,7 @@ impl<'a> Operator<'a> {
                 &mut num_inputs,
                 &mut inputs_ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedOperatorField,
+                std::ptr::null_mut(),
             )
         })?;
         // Convert raw C pointers to fixed length slice
@@ -901,7 +901,7 @@ impl<'a> Operator<'a> {
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -934,7 +934,7 @@ impl<'a> Operator<'a> {
             bind_ceed::CeedOperatorGetFields(
                 self.op_core.ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedOperatorField,
+                std::ptr::null_mut(),
                 &mut num_outputs,
                 &mut outputs_ptr,
             )
@@ -2139,7 +2139,7 @@ impl<'a> CompositeOperator<'a> {
     ///
     /// // Sub operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index c127cbe98b..08ca709051 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -108,7 +108,7 @@ impl<'a> QFunctionField<'a> {
         unsafe {
             bind_ceed::CeedQFunctionFieldGetEvalMode(self.ptr, &mut mode);
         }
-        crate::EvalMode::from_u32(mode as u32)
+        crate::EvalMode::from_u32(mode)
     }
 }
 
@@ -477,7 +477,7 @@ impl<'a> QFunctionCore<'a> {
                 &mut num_inputs,
                 &mut inputs_ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedQFunctionField,
+                std::ptr::null_mut(),
             )
         })?;
         // Convert raw C pointers to fixed length slice
@@ -495,7 +495,7 @@ impl<'a> QFunctionCore<'a> {
             bind_ceed::CeedQFunctionGetFields(
                 self.ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedQFunctionField,
+                std::ptr::null_mut(),
                 &mut num_outputs,
                 &mut outputs_ptr,
             )
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index dcf86d9e16..a095822c15 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -787,10 +787,7 @@ impl<'a> VectorView<'a> {
                 &mut array,
             )
         })?;
-        Ok(Self {
-            vec: vec,
-            array: array,
-        })
+        Ok(Self { vec, array })
     }
 }
 
@@ -839,10 +836,7 @@ impl<'a> VectorViewMut<'a> {
                 &mut ptr,
             )
         })?;
-        Ok(Self {
-            vec: vec,
-            array: ptr,
-        })
+        Ok(Self { vec, array: ptr })
     }
 }
 

From e60d507d35e0fc62a9fa0cec36b2c643702b7059 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 24 Feb 2025 12:59:06 -0700
Subject: [PATCH 356/571] rust - minor consistency

---
 rust/libceed/src/vector.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index a095822c15..2a25041f48 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -828,15 +828,15 @@ pub struct VectorViewMut<'a> {
 impl<'a> VectorViewMut<'a> {
     /// Construct a VectorViewMut from a Vector reference
     fn new(vec: &'a mut Vector) -> crate::Result<Self> {
-        let mut ptr = std::ptr::null_mut();
+        let mut array = std::ptr::null_mut();
         vec.check_error(unsafe {
             bind_ceed::CeedVectorGetArray(
                 vec.ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
-                &mut ptr,
+                &mut array,
             )
         })?;
-        Ok(Self { vec, array: ptr })
+        Ok(Self { vec, array })
     }
 }
 

From 78c2cefab85ecc740ba63f6df2f5d30a7426d159 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 21 Mar 2025 10:46:27 -0600
Subject: [PATCH 357/571] rust - more clippy fixes

---
 examples/rust/ex1-volume/src/main.rs         |  2 ++
 examples/rust/ex2-surface/src/main.rs        |  2 ++
 examples/rust/ex3-vector-volume/src/main.rs  |  2 ++
 examples/rust/ex4-vector-surface/src/main.rs |  2 ++
 examples/rust/mesh/src/lib.rs                | 26 +++++++++++---------
 rust/libceed/src/basis.rs                    |  6 ++++-
 rust/libceed/src/elem_restriction.rs         |  5 +++-
 rust/libceed/src/lib.rs                      |  7 ++++++
 rust/libceed/src/qfunction.rs                |  2 +-
 rust/libceed/src/vector.rs                   | 19 +++++++++++++-
 10 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs
index 88f87d656e..dac51d4278 100644
--- a/examples/rust/ex1-volume/src/main.rs
+++ b/examples/rust/ex1-volume/src/main.rs
@@ -33,6 +33,8 @@ fn main() -> libceed::Result<()> {
     example_1(options)
 }
 
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
 fn example_1(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index e0cec46e7c..de5172aef6 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -34,6 +34,8 @@ fn main() -> libceed::Result<()> {
     example_2(options)
 }
 
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
 fn example_2(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex3-vector-volume/src/main.rs
index 7b9a86a465..ef8636791a 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex3-vector-volume/src/main.rs
@@ -34,6 +34,8 @@ fn main() -> libceed::Result<()> {
     example_3(options)
 }
 
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
 fn example_3(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex4-vector-surface/src/main.rs
index fc15c1b034..a05d35a222 100644
--- a/examples/rust/ex4-vector-surface/src/main.rs
+++ b/examples/rust/ex4-vector-surface/src/main.rs
@@ -35,6 +35,8 @@ fn main() -> libceed::Result<()> {
     example_4(options)
 }
 
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
 fn example_4(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs
index f249e6dfab..5623cdaaaf 100644
--- a/examples/rust/mesh/src/lib.rs
+++ b/examples/rust/mesh/src/lib.rs
@@ -6,6 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 use libceed::{Ceed, ElemRestriction, Vector};
+use std::convert::TryInto;
 
 // ----------------------------------------------------------------------------
 // Determine problem size in each dimension from size and dimenison
@@ -22,16 +23,19 @@ pub fn cartesian_mesh_size(dim: usize, solution_degree: usize, problem_size: i64
 
     // Size per dimension
     let mut r = s % dim;
-    let mut num_xyz = [0; 3];
-    for d in 0..dim {
-        let mut sd = s / dim;
-        if r > 0 {
-            sd += 1;
-            r -= 1;
-        }
-        num_xyz[d] = 1 << sd;
-    }
-    num_xyz
+    let xyz: [usize; 3] = (0..3)
+        .map(|_| -> usize {
+            let mut sd = s / dim;
+            if r > 0 {
+                sd += 1;
+                r -= 1;
+            }
+            1 << sd
+        })
+        .collect::<Vec<usize>>()
+        .try_into()
+        .unwrap();
+    xyz
 }
 
 // ----------------------------------------------------------------------------
@@ -95,7 +99,7 @@ pub fn build_cartesian_restriction(
         &elem_nodes,
     )?;
 
-    // Quadratue data restriction
+    // Quadrature data restriction
     let rstr_qdata = ceed.strided_elem_restriction(
         num_elem,
         elem_qpts,
diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index d16f8420f6..72c3cb8bde 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -27,7 +27,7 @@ impl<'a> From<&'a Basis<'_>> for BasisOpt<'a> {
 }
 impl<'a> BasisOpt<'a> {
     /// Transform a Rust libCEED BasisOpt into C libCEED CeedBasis
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedBasis {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedBasis {
         match self {
             Self::Some(basis) => basis.ptr,
             Self::None => unsafe { bind_ceed::CEED_BASIS_NONE },
@@ -134,6 +134,7 @@ impl<'a> fmt::Display for Basis<'a> {
 // -----------------------------------------------------------------------------
 impl<'a> Basis<'a> {
     // Constructors
+    #[allow(clippy::too_many_arguments)]
     pub fn create_tensor_H1(
         ceed: &crate::Ceed,
         dim: usize,
@@ -204,6 +205,7 @@ impl<'a> Basis<'a> {
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_H1(
         ceed: &crate::Ceed,
         topo: crate::ElemTopology,
@@ -242,6 +244,7 @@ impl<'a> Basis<'a> {
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_Hdiv(
         ceed: &crate::Ceed,
         topo: crate::ElemTopology,
@@ -280,6 +283,7 @@ impl<'a> Basis<'a> {
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_Hcurl(
         ceed: &crate::Ceed,
         topo: crate::ElemTopology,
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index 60692b49e5..e6dcd74145 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -28,7 +28,7 @@ impl<'a> From<&'a ElemRestriction<'_>> for ElemRestrictionOpt<'a> {
 impl<'a> ElemRestrictionOpt<'a> {
     /// Transform a Rust libCEED ElemRestrictionOpt into C libCEED
     /// CeedElemRestriction
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedElemRestriction {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedElemRestriction {
         match self {
             Self::Some(rstr) => rstr.ptr,
             Self::None => unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE },
@@ -153,6 +153,7 @@ impl<'a> fmt::Display for ElemRestriction<'a> {
 // -----------------------------------------------------------------------------
 impl<'a> ElemRestriction<'a> {
     // Constructors
+    #[allow(clippy::too_many_arguments)]
     pub fn create(
         ceed: &crate::Ceed,
         nelem: usize,
@@ -199,6 +200,7 @@ impl<'a> ElemRestriction<'a> {
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_oriented(
         ceed: &crate::Ceed,
         nelem: usize,
@@ -240,6 +242,7 @@ impl<'a> ElemRestriction<'a> {
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_curl_oriented(
         ceed: &crate::Ceed,
         nelem: usize,
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index 04d34ad25d..0821989a7e 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -441,6 +441,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn elem_restriction<'a>(
         &self,
         nelem: usize,
@@ -504,6 +505,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn oriented_elem_restriction<'a>(
         &self,
         nelem: usize,
@@ -590,6 +592,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn curl_oriented_elem_restriction<'a>(
         &self,
         nelem: usize,
@@ -691,6 +694,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_tensor_H1<'a>(
         &self,
         dim: usize,
@@ -853,6 +857,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_H1<'a>(
         &self,
         topo: ElemTopology,
@@ -951,6 +956,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_Hdiv<'a>(
         &self,
         topo: ElemTopology,
@@ -1045,6 +1051,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_Hcurl<'a>(
         &self,
         topo: ElemTopology,
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index 08ca709051..652ad3b4e4 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -139,7 +139,7 @@ impl<'a> From<&'a QFunctionByName<'_>> for QFunctionOpt<'a> {
 
 impl<'a> QFunctionOpt<'a> {
     /// Transform a Rust libCEED QFunctionOpt into C libCEED CeedQFunction
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedQFunction {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedQFunction {
         match self {
             Self::SomeQFunction(qfunc) => qfunc.qf_core.ptr,
             Self::SomeQFunctionByName(qfunc) => qfunc.qf_core.ptr,
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index 2a25041f48..827aac8436 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -34,7 +34,7 @@ impl<'a> From<&'a Vector<'_>> for VectorOpt<'a> {
 }
 impl<'a> VectorOpt<'a> {
     /// Transform a Rust libCEED VectorOpt into C libCEED CeedVector
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedVector {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedVector {
         match self {
             Self::Some(vec) => vec.ptr,
             Self::Active => unsafe { bind_ceed::CEED_VECTOR_ACTIVE },
@@ -385,6 +385,23 @@ impl<'a> Vector<'a> {
         self.length()
     }
 
+    /// Returns true if the Vector contains no elements
+    ///
+    /// ```
+    /// # use libceed::prelude::*;
+    /// # fn main() -> libceed::Result<()> {
+    /// # let ceed = libceed::Ceed::default_init();
+    /// let vec = ceed.vector(10)?;
+    /// assert!(!vec.is_empty(), "Incorrect emptiness");
+    /// let empty_vec = ceed.vector(0)?;
+    /// assert!(empty_vec.is_empty(), "Incorrect emptiness");
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn is_empty(&self) -> bool {
+        self.length() == 0
+    }
+
     /// Set the Vector to a constant value
     ///
     /// # arguments

From 9ab2bffdf1d647bda81a50b6e25869e9013f7257 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 21 Mar 2025 10:55:34 -0600
Subject: [PATCH 358/571] rust - more clippy fixes

---
 rust/libceed/src/operator.rs  |  2 +-
 rust/libceed/src/qfunction.rs | 18 ++++--------------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index e4887f63f9..bd480c3936 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -1898,7 +1898,7 @@ impl<'a> Operator<'a> {
         p_mult_fine: &Vector,
         rstr_coarse: &ElemRestriction,
         basis_coarse: &Basis,
-        interpCtoF: &Vec<crate::Scalar>,
+        interpCtoF: &[crate::Scalar],
     ) -> crate::Result<(Operator<'b>, Operator<'b>, Operator<'b>)> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index 652ad3b4e4..dac75c8f9f 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -566,12 +566,6 @@ unsafe extern "C" fn trampoline(
     (trampoline_data.get_unchecked_mut().user_f)(inputs_array, outputs_array)
 }
 
-unsafe extern "C" fn destroy_trampoline(ctx: *mut ::std::os::raw::c_void) -> ::std::os::raw::c_int {
-    let trampoline_data: Pin<&mut QFunctionTrampolineData> = std::mem::transmute(ctx);
-    drop(trampoline_data);
-    0 // Clean error code
-}
-
 // -----------------------------------------------------------------------------
 // QFunction
 // -----------------------------------------------------------------------------
@@ -623,14 +617,10 @@ impl<'a> QFunction<'a> {
                 crate::MemType::Host as bind_ceed::CeedMemType,
                 crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode,
                 std::mem::size_of::<QFunctionTrampolineData>(),
-                std::mem::transmute(trampoline_data.as_ref()),
-            )
-        })?;
-        ceed.check_error(unsafe {
-            bind_ceed::CeedQFunctionContextSetDataDestroy(
-                qf_ctx_ptr,
-                crate::MemType::Host as bind_ceed::CeedMemType,
-                Some(destroy_trampoline),
+                std::mem::transmute::<
+                    std::pin::Pin<&QFunctionTrampolineData>,
+                    *mut std::ffi::c_void,
+                >(trampoline_data.as_ref()),
             )
         })?;
         ceed.check_error(unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) })?;

From 91216f769229b3793e221c647f948ce6d0249ef6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 21 Mar 2025 12:24:27 -0600
Subject: [PATCH 359/571] rust - add annotation to generated bindings for
 clippy

---
 rust/libceed-sys/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rust/libceed-sys/src/lib.rs b/rust/libceed-sys/src/lib.rs
index 1279fdddf5..3335910adb 100644
--- a/rust/libceed-sys/src/lib.rs
+++ b/rust/libceed-sys/src/lib.rs
@@ -17,5 +17,6 @@ pub mod bind_ceed {
     #![allow(non_upper_case_globals)]
     #![allow(non_camel_case_types)]
     #![allow(dead_code)]
+    #![allow(clippy::too_long_first_doc_paragraph)]
     include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
 }

From 6eb06d7cb0f5787c494a4969c0aa6769f3bcfbd0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 21 Mar 2025 12:48:53 -0600
Subject: [PATCH 360/571] format - use llvm 19

---
 .github/workflows/c-fortran-test-style.yml | 6 +++---
 include/ceed/backend.h                     | 6 ++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml
index 5658082596..7ee4f7f2d6 100644
--- a/.github/workflows/c-fortran-test-style.yml
+++ b/.github/workflows/c-fortran-test-style.yml
@@ -21,12 +21,12 @@ jobs:
     - name: Install clang-format
       run: |
           wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-          sudo add-apt-repository 'deb http://apt.llvm.org/noble/ llvm-toolchain-noble-18 main'
-          sudo apt update && sudo apt install clang-format-18
+          sudo add-apt-repository 'deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main'
+          sudo apt update && sudo apt install clang-format-19
     - name: C style
       env:
         CC: ${{ matrix.compiler }}
         FC: gfortran-11
       run: |
         make info
-        make format-c -j CLANG_FORMAT=clang-format-18 && git diff --exit-code
+        make format-c -j CLANG_FORMAT=clang-format-19 && git diff --exit-code
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index a599730de8..7cdbe63ef9 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -142,8 +142,10 @@ CEED_EXTERN bool CeedDebugFlagEnv(void);
   @ingroup Ceed
   @ref     Backend
 **/
-#define CeedWarn(...) \
-  { CeedDebugImpl256(CEED_DEBUG_COLOR_WARNING, ##__VA_ARGS__); }
+#define CeedWarn(...)                                          \
+  {                                                            \
+    CeedDebugImpl256(CEED_DEBUG_COLOR_WARNING, ##__VA_ARGS__); \
+  }
 
 /**
   Swap the values of two CeedScalars

From eaf9ad10749a18f88ca456c5a77eb8cd19c96504 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Fri, 21 Mar 2025 15:47:59 -0700
Subject: [PATCH 361/571] Include tensor basis header for pure-tensor operators

---
 backends/hip-gen/ceed-hip-gen-operator-build.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 7e9e3d6863..0638c12dca 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1204,7 +1204,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   }
 
   // Load basis source files
-  if (!is_all_tensor) {
+  if (!is_all_nontensor) {
     code << "// Tensor basis source\n";
     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
   }

From 8ddf3821d65409a20079cd5155deefdfc980e853 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 25 Mar 2025 09:26:44 -0600
Subject: [PATCH 362/571] ci - use macos-15

---
 .github/workflows/c-fortran-test-linux-osx.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index 180eb9ecdc..b49d23e1bc 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-24.04, macos-13]
+        os: [ubuntu-24.04, macos-15]
         compiler: [gcc-13, clang]
 
     runs-on: ${{ matrix.os }}

From 44bedce47970978d946e78a8673df21b3b2dd4a0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 25 Mar 2025 12:42:11 -0600
Subject: [PATCH 363/571] ci - macos needs -jn now over -j

---
 .github/workflows/c-fortran-test-linux-osx.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index b49d23e1bc..1705f42936 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -25,4 +25,4 @@ jobs:
       run: |
         make info
         make -j
-        make prove -j
+        make prove -j2

From 2686ebe66fefd82d726dbc7f3840c05a817b6ad4 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Fri, 21 Mar 2025 23:31:06 -0600
Subject: [PATCH 364/571] doc: fix misspelled parameter

---
 interface/ceed.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interface/ceed.c b/interface/ceed.c
index 0caff73f62..445959fd20 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -949,12 +949,12 @@ int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots) {
 
   @ref Backend
 **/
-int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines) {
+int CeedGetJitDefines(Ceed ceed, CeedInt *num_jit_defines, const char ***jit_defines) {
   Ceed ceed_parent;
 
   CeedCall(CeedGetParent(ceed, &ceed_parent));
-  *num_defines = ceed_parent->num_jit_defines;
-  *jit_defines = (const char **)ceed_parent->jit_defines;
+  *num_jit_defines = ceed_parent->num_jit_defines;
+  *jit_defines     = (const char **)ceed_parent->jit_defines;
   ceed_parent->num_jit_defines_readers++;
   CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;

From 4f69910b6e3819988a1446e35e0e85e74672bc23 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Fri, 21 Mar 2025 23:32:05 -0600
Subject: [PATCH 365/571] Add run-time interface for build-time config (fix
 #1732)

This adds CeedGetGitVersion() to access output of `git describe --dirty`
and CeedGetBuildConfiguration() to get variables like compilers and
flags.

For builds without a Git repo, we return "unknown" or the user can set
the string via `make GIT_DESCRIBE=deadbeaf`.
---
 Makefile                          | 23 +++++++++++++++++--
 doc/sphinx/source/releasenotes.md |  2 ++
 include/ceed/ceed.h               |  2 ++
 interface/ceed.c                  | 37 ++++++++++++++++++++++++++++++-
 tests/t010-config.c               | 14 ++++++++++++
 5 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 tests/t010-config.c

diff --git a/Makefile b/Makefile
index c49058bbcb..dffb530d4b 100644
--- a/Makefile
+++ b/Makefile
@@ -171,7 +171,7 @@ CFLAGS ?= $(OPT) $(CFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS))
 CXXFLAGS ?= $(OPT) $(CXXFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS))
 FFLAGS ?= $(OPT) $(FFLAGS.$(FC_VENDOR))
 LIBCXX ?= -lstdc++
-NVCCFLAGS ?= -ccbin $(CXX) -Xcompiler "$(OPT)" -Xcompiler -fPIC
+NVCCFLAGS ?= -ccbin $(CXX) -Xcompiler '$(OPT)' -Xcompiler -fPIC
 ifneq ($(CUDA_ARCH),)
   NVCCFLAGS += -arch=$(CUDA_ARCH)
 endif
@@ -660,7 +660,7 @@ endif
 # when creating shared or static libraries.
 weak_last = $(filter-out %-weak.o,$(1)) $(filter %-weak.o,$(1))
 
-libceed.o = $(libceed.c:%.c=$(OBJDIR)/%.o) $(libceed.cpp:%.cpp=$(OBJDIR)/%.o) $(libceed.cu:%.cu=$(OBJDIR)/%.o) $(libceed.hip:%.hip.cpp=$(OBJDIR)/%.o) $(libceed.sycl:%.sycl.cpp=$(OBJDIR)/%.o)
+libceed.o = $(libceed.c:%.c=$(OBJDIR)/%.o) $(OBJDIR)/interface/ceed-config.o $(libceed.cpp:%.cpp=$(OBJDIR)/%.o) $(libceed.cu:%.cu=$(OBJDIR)/%.o) $(libceed.hip:%.hip.cpp=$(OBJDIR)/%.o) $(libceed.sycl:%.sycl.cpp=$(OBJDIR)/%.o)
 $(filter %fortran.o,$(libceed.o)) : CPPFLAGS += $(if $(filter 1,$(UNDERSCORE)),-DUNDERSCORE)
 $(libceed.o): | info-backends
 $(libceed.so) : $(call weak_last,$(libceed.o)) | $$(@D)/.DIR
@@ -672,6 +672,9 @@ $(libceed.a) : $(call weak_last,$(libceed.o)) | $$(@D)/.DIR
 $(OBJDIR)/%.o : $(CURDIR)/%.c | $$(@D)/.DIR
 	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
 
+$(OBJDIR)/%.o : $(OBJDIR)/%.c | $$(@D)/.DIR # source files generated in OBJDIR
+	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
+
 $(OBJDIR)/%.o : $(CURDIR)/%.cpp | $$(@D)/.DIR
 	$(call quiet,CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $(abspath $<)
 
@@ -847,6 +850,22 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 	    -e "s:%prefix%:$(pkgconfig-prefix):" \
 	    -e "s:%libs_private%:$(pkgconfig-libs-private):" $< > $@
 
+GIT_DESCRIBE = $(shell git describe --dirty || printf "unknown\n")
+
+$(OBJDIR)/interface/ceed-config.c: Makefile
+	@$(file >$@,#include <ceed-impl.h>) \
+	$(file >>$@,int CeedGetGitVersion(const char **git_version) {) \
+  $(file >>$@,  *git_version = "$(GIT_DESCRIBE)";) \
+	$(file >>$@,  return 0;) \
+  $(file >>$@,}) \
+  $(file >>$@,) \
+  $(file >>$@,int CeedGetBuildConfiguration(const char **build_config) {) \
+  $(file >>$@,  *build_config =) \
+  $(foreach v,$(CONFIG_VARS),$(file >>$@,"$(v) = $($(v))\n")) \
+	$(file >>$@,  ;) \
+	$(file >>$@,  return 0;) \
+  $(file >>$@,})
+
 $(OBJDIR)/interface/ceed-jit-source-root-default.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
 $(OBJDIR)/interface/ceed-jit-source-root-install.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath $(includedir))/\""
 
diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 28d5caaf62..2e329e54d7 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -26,6 +26,8 @@ On this page we provide a summary of the main API changes, new features and exam
 Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`.
 - Added non-tensor basis support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen`.
 - Added support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen` for operators with both tensor and non-tensor bases.
+- Add `CeedGetGitVersion()` to access the Git commit and dirty state of the repository at build time.
+- Add `CeedGetBuildConfiguration()` to access compilers, flags, and related information about the build environment.
 
 ### Examples
 
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index b1851d6a27..b56a719754 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -163,6 +163,8 @@ CEED_EXTERN int CeedErrorExit(Ceed ceed, const char *filename, int line_no, cons
     (CEED_VERSION_MAJOR == major && (CEED_VERSION_MINOR > minor || (CEED_VERSION_MINOR == minor && CEED_VERSION_PATCH >= patch)))))
 
 CEED_EXTERN int CeedGetVersion(int *major, int *minor, int *patch, bool *release);
+CEED_EXTERN int CeedGetGitVersion(const char **git_version);
+CEED_EXTERN int CeedGetBuildConfiguration(const char **build_config);
 
 CEED_EXTERN int CeedGetScalarType(CeedScalarType *scalar_type);
 
diff --git a/interface/ceed.c b/interface/ceed.c
index 445959fd20..2d4ea6f6ec 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1672,7 +1672,7 @@ int CeedResetErrorMessage(Ceed ceed, const char **err_msg) {
 
   @ref Developer
 
-  @sa CEED_VERSION_GE()
+  @sa CEED_VERSION_GE() CeedGetGitVersion() CeedGetBuildConfiguration()
 */
 int CeedGetVersion(int *major, int *minor, int *patch, bool *release) {
   if (major) *major = CEED_VERSION_MAJOR;
@@ -1682,6 +1682,41 @@ int CeedGetVersion(int *major, int *minor, int *patch, bool *release) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get output of `git describe --dirty` from build time
+
+  While @ref CeedGetVersion() uniqely identifies the source code for release
+  builds, it does not identify builds from other commits.
+
+  @param[out] git_version A static string containing the Git commit description.
+
+  If `git describe --dirty` fails, the string `"unknown"` will be provided. This
+  could occur if Git is not installed or if libCEED is not being built from a
+  repository, for example.`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+
+  @sa CeedGetVersion() CeedGetBuildConfiguration()
+*/
+int CeedGetGitVersion(const char **git_version);  // defined in generated ceed-config.h
+
+/**
+  @brief Get build variables as a multi-line string
+
+  Each line of the string has the format `VARNAME = value`.
+
+  @param[out] build_config A static string containing build variables
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+
+  @sa CeedGetVersion() CeedGetGitVersion()
+*/
+int CeedGetBuildConfiguration(const char **build_config);  // defined in generated ceed-config.h
+
 /**
   @brief Get libCEED scalar type, such as F64 or F32
 
diff --git a/tests/t010-config.c b/tests/t010-config.c
new file mode 100644
index 0000000000..1becbc6ddd
--- /dev/null
+++ b/tests/t010-config.c
@@ -0,0 +1,14 @@
+/// @file
+/// Test git version and build configuration
+/// \test Test git version and build configuration
+#include <ceed.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  const char *git_version, *build_config;
+  CeedGetGitVersion(&git_version);
+  CeedGetBuildConfiguration(&build_config);
+  // printf("Git: %s\n", git_version);
+  // puts(build_config);
+  return 0;
+}

From fd27ce53f063357d4a24dac1b435f98ba46cb3f8 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Fri, 21 Mar 2025 23:38:12 -0600
Subject: [PATCH 366/571] doc: upgrade Python packages

---
 doc/sphinx/requirements.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/sphinx/requirements.txt b/doc/sphinx/requirements.txt
index 76b40ca3ab..f4f8145a5c 100644
--- a/doc/sphinx/requirements.txt
+++ b/doc/sphinx/requirements.txt
@@ -1,11 +1,11 @@
 altair>=5.0
-breathe>=4.30
-myst-parser[linkify]>=0.14.0
-sphinx-hoverxref>=0.3b1
+breathe>=4.36
+myst-parser[linkify]>=4.0.1
+sphinx-hoverxref>=1.4.2
 sphinx-design
-sphinx>=5.3,<6
+sphinx>=7.2
 sphinx_rtd_theme
-sphinxcontrib-bibtex==2.5
+sphinxcontrib-bibtex==2.6.3
 sphinxcontrib-katex
 sphinxcontrib-mermaid
 sphinxcontrib-svg2pdfconverter

From c2d180adb4355a42c129dc3a120cdc0aec996d4b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 25 Mar 2025 08:47:34 -0600
Subject: [PATCH 367/571] make - fallback to simpler git describe if able

---
 Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index dffb530d4b..b87e7b30c0 100644
--- a/Makefile
+++ b/Makefile
@@ -456,6 +456,11 @@ info:
 	$(info pkgconfigdir  = $(value pkgconfigdir))
 	$(info )
 	$(info -----------------------------------------)
+	$(info )
+	$(info Git:)
+	$(info describe      = $(GIT_DESCRIBE))
+	$(info )
+	$(info -----------------------------------------)
 	@true
 
 info-backends:
@@ -850,10 +855,11 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 	    -e "s:%prefix%:$(pkgconfig-prefix):" \
 	    -e "s:%libs_private%:$(pkgconfig-libs-private):" $< > $@
 
-GIT_DESCRIBE = $(shell git describe --dirty || printf "unknown\n")
+GIT_DESCRIBE = $(shell git describe --always --dirty 2>/dev/null || printf "unknown\n")
 
 $(OBJDIR)/interface/ceed-config.c: Makefile
 	@$(file >$@,#include <ceed-impl.h>) \
+	$(file >>$@,) \
 	$(file >>$@,int CeedGetGitVersion(const char **git_version) {) \
   $(file >>$@,  *git_version = "$(GIT_DESCRIBE)";) \
 	$(file >>$@,  return 0;) \

From 428c7f6a3ea1462dd35b3c8a672a530c0e133c75 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 25 Mar 2025 14:40:40 -0600
Subject: [PATCH 368/571] make - version <4 does not support file

---
 Makefile | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index b87e7b30c0..fddda541b3 100644
--- a/Makefile
+++ b/Makefile
@@ -858,19 +858,17 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 GIT_DESCRIBE = $(shell git describe --always --dirty 2>/dev/null || printf "unknown\n")
 
 $(OBJDIR)/interface/ceed-config.c: Makefile
-	@$(file >$@,#include <ceed-impl.h>) \
-	$(file >>$@,) \
-	$(file >>$@,int CeedGetGitVersion(const char **git_version) {) \
-  $(file >>$@,  *git_version = "$(GIT_DESCRIBE)";) \
-	$(file >>$@,  return 0;) \
-  $(file >>$@,}) \
-  $(file >>$@,) \
-  $(file >>$@,int CeedGetBuildConfiguration(const char **build_config) {) \
-  $(file >>$@,  *build_config =) \
-  $(foreach v,$(CONFIG_VARS),$(file >>$@,"$(v) = $($(v))\n")) \
-	$(file >>$@,  ;) \
-	$(file >>$@,  return 0;) \
-  $(file >>$@,})
+	@printf '#include <ceed-impl.h>\n\n' > $@
+	@printf 'int CeedGetGitVersion(const char **git_version) {\n' >> $@
+	@printf '  *git_version = "$(GIT_DESCRIBE)";\n' >> $@
+	@printf '  return 0;\n' >> $@
+	@printf '}\n\n' >> $@
+	@printf 'int CeedGetBuildConfiguration(const char **build_config) {\n' >> $@
+	@printf '  *build_config =' >> $@
+	@printf "$(foreach v,$(CONFIG_VARS),\n\"$(v) = $($(v))\\\n\")\n" >> $@
+	@printf ';\n' >> $@
+	@printf '  return 0;\n' >> $@ 
+	@printf '}\n' >> $@
 
 $(OBJDIR)/interface/ceed-jit-source-root-default.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
 $(OBJDIR)/interface/ceed-jit-source-root-install.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath $(includedir))/\""

From 944f002e154c874f981181073a53f172adf7bcd6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 25 Mar 2025 15:43:33 -0600
Subject: [PATCH 369/571] make - ci doesn't always have perm to write files

---
 Makefile                | 22 ++++------------
 interface/ceed-config.c | 56 +++++++++++++++++++++++++++++++++++++++++
 interface/ceed.c        | 35 --------------------------
 3 files changed, 61 insertions(+), 52 deletions(-)
 create mode 100644 interface/ceed-config.c

diff --git a/Makefile b/Makefile
index fddda541b3..7cf1c34dad 100644
--- a/Makefile
+++ b/Makefile
@@ -665,7 +665,7 @@ endif
 # when creating shared or static libraries.
 weak_last = $(filter-out %-weak.o,$(1)) $(filter %-weak.o,$(1))
 
-libceed.o = $(libceed.c:%.c=$(OBJDIR)/%.o) $(OBJDIR)/interface/ceed-config.o $(libceed.cpp:%.cpp=$(OBJDIR)/%.o) $(libceed.cu:%.cu=$(OBJDIR)/%.o) $(libceed.hip:%.hip.cpp=$(OBJDIR)/%.o) $(libceed.sycl:%.sycl.cpp=$(OBJDIR)/%.o)
+libceed.o = $(libceed.c:%.c=$(OBJDIR)/%.o) $(libceed.cpp:%.cpp=$(OBJDIR)/%.o) $(libceed.cu:%.cu=$(OBJDIR)/%.o) $(libceed.hip:%.hip.cpp=$(OBJDIR)/%.o) $(libceed.sycl:%.sycl.cpp=$(OBJDIR)/%.o)
 $(filter %fortran.o,$(libceed.o)) : CPPFLAGS += $(if $(filter 1,$(UNDERSCORE)),-DUNDERSCORE)
 $(libceed.o): | info-backends
 $(libceed.so) : $(call weak_last,$(libceed.o)) | $$(@D)/.DIR
@@ -675,10 +675,7 @@ $(libceed.a) : $(call weak_last,$(libceed.o)) | $$(@D)/.DIR
 	$(call quiet,AR) $(ARFLAGS) $@ $^
 
 $(OBJDIR)/%.o : $(CURDIR)/%.c | $$(@D)/.DIR
-	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
-
-$(OBJDIR)/%.o : $(OBJDIR)/%.c | $$(@D)/.DIR # source files generated in OBJDIR
-	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
+	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) $(CONFIGFLAGS) -c -o $@ $(abspath $<)
 
 $(OBJDIR)/%.o : $(CURDIR)/%.cpp | $$(@D)/.DIR
 	$(call quiet,CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $(abspath $<)
@@ -857,18 +854,9 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 
 GIT_DESCRIBE = $(shell git describe --always --dirty 2>/dev/null || printf "unknown\n")
 
-$(OBJDIR)/interface/ceed-config.c: Makefile
-	@printf '#include <ceed-impl.h>\n\n' > $@
-	@printf 'int CeedGetGitVersion(const char **git_version) {\n' >> $@
-	@printf '  *git_version = "$(GIT_DESCRIBE)";\n' >> $@
-	@printf '  return 0;\n' >> $@
-	@printf '}\n\n' >> $@
-	@printf 'int CeedGetBuildConfiguration(const char **build_config) {\n' >> $@
-	@printf '  *build_config =' >> $@
-	@printf "$(foreach v,$(CONFIG_VARS),\n\"$(v) = $($(v))\\\n\")\n" >> $@
-	@printf ';\n' >> $@
-	@printf '  return 0;\n' >> $@ 
-	@printf '}\n' >> $@
+$(OBJDIR)/interface/ceed-config.o: Makefile
+$(OBJDIR)/interface/ceed-config.o: CONFIGFLAGS += -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\""
+$(OBJDIR)/interface/ceed-config.o: CONFIGFLAGS += -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\""
 
 $(OBJDIR)/interface/ceed-jit-source-root-default.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
 $(OBJDIR)/interface/ceed-jit-source-root-install.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath $(includedir))/\""
diff --git a/interface/ceed-config.c b/interface/ceed-config.c
new file mode 100644
index 0000000000..114f67d81e
--- /dev/null
+++ b/interface/ceed-config.c
@@ -0,0 +1,56 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed-impl.h>
+
+const char *CeedGitVersion         = CEED_GIT_VERSION;
+const char *CeedBuildConfiguration = CEED_BUILD_CONFIGURATION;
+
+/// @addtogroup CeedUser
+/// @{
+
+/**
+  @brief Get output of `git describe --dirty` from build time
+
+  While @ref CeedGetVersion() uniqely identifies the source code for release
+  builds, it does not identify builds from other commits.
+
+  @param[out] git_version A static string containing the Git commit description.
+
+  If `git describe --always --dirty` fails, the string `"unknown"` will be provided.
+  This could occur if Git is not installed or if libCEED is not being built from a repository, for example.`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+
+  @sa CeedGetVersion() CeedGetBuildConfiguration()
+*/
+int CeedGetGitVersion(const char **git_version) {
+  *git_version = CeedGitVersion;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get build variables as a multi-line string
+
+  Each line of the string has the format `VARNAME = value`.
+
+  @param[out] build_config A static string containing build variables
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+
+  @sa CeedGetVersion() CeedGetGitVersion()
+*/
+int CeedGetBuildConfiguration(const char **build_config) {
+  *build_config = CeedBuildConfiguration;
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
diff --git a/interface/ceed.c b/interface/ceed.c
index 2d4ea6f6ec..114aae9a20 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1682,41 +1682,6 @@ int CeedGetVersion(int *major, int *minor, int *patch, bool *release) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get output of `git describe --dirty` from build time
-
-  While @ref CeedGetVersion() uniqely identifies the source code for release
-  builds, it does not identify builds from other commits.
-
-  @param[out] git_version A static string containing the Git commit description.
-
-  If `git describe --dirty` fails, the string `"unknown"` will be provided. This
-  could occur if Git is not installed or if libCEED is not being built from a
-  repository, for example.`
-
-  @return An error code: 0 - success, otherwise - failure
-
-  @ref Developer
-
-  @sa CeedGetVersion() CeedGetBuildConfiguration()
-*/
-int CeedGetGitVersion(const char **git_version);  // defined in generated ceed-config.h
-
-/**
-  @brief Get build variables as a multi-line string
-
-  Each line of the string has the format `VARNAME = value`.
-
-  @param[out] build_config A static string containing build variables
-
-  @return An error code: 0 - success, otherwise - failure
-
-  @ref Developer
-
-  @sa CeedGetVersion() CeedGetGitVersion()
-*/
-int CeedGetBuildConfiguration(const char **build_config);  // defined in generated ceed-config.h
-
 /**
   @brief Get libCEED scalar type, such as F64 or F32
 

From 5ad1e4caa56f68bbb3a0afebca93fbb72e33e160 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 26 Mar 2025 10:07:19 -0600
Subject: [PATCH 370/571] lib - add OPT as cflags_extra in ceed.pc

---
 Makefile         | 1 +
 ceed.pc.template | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index c49058bbcb..7b7e254d93 100644
--- a/Makefile
+++ b/Makefile
@@ -845,6 +845,7 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 %/ceed.pc : ceed.pc.template | $$(@D)/.DIR
 	@$(SED) \
 	    -e "s:%prefix%:$(pkgconfig-prefix):" \
+	    -e "s:%opt%:$(OPT):" \
 	    -e "s:%libs_private%:$(pkgconfig-libs-private):" $< > $@
 
 $(OBJDIR)/interface/ceed-jit-source-root-default.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
diff --git a/ceed.pc.template b/ceed.pc.template
index 56bc5a076f..1d8458a4ee 100644
--- a/ceed.pc.template
+++ b/ceed.pc.template
@@ -1,6 +1,7 @@
 prefix=%prefix%
 includedir=${prefix}/include
 libdir=${prefix}/lib
+cflags_extra=%opt%
 
 Name: CEED
 Description: Code for Efficient Extensible Discretization

From a3b195ef6dd39c849072dd5df2f934c50a4df099 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 20 Mar 2025 10:42:36 -0600
Subject: [PATCH 371/571] hip - use unified memory Co-authored-by: Zach Atkins
 <zach.atkins@colorado.edu>

---
 backends/hip-ref/ceed-hip-ref-vector.c | 118 ++++++++++++++++++++++---
 backends/hip-ref/ceed-hip-ref.h        |   1 +
 backends/hip/ceed-hip-common.c         |   9 +-
 backends/hip/ceed-hip-common.h         |   1 +
 4 files changed, 114 insertions(+), 15 deletions(-)

diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 50e7064551..2c1748033e 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -95,7 +95,15 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) {
 // Sync arrays
 //------------------------------------------------------------------------------
 static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) {
-  bool need_sync = false;
+  bool            need_sync = false;
+  CeedVector_Hip *impl;
+
+  // Sync for unified memory
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  if (impl->has_unified_addressing && !impl->h_array_borrowed) {
+    CeedCallHip(CeedVectorReturnCeed(vec), hipDeviceSynchronize());
+    return CEED_ERROR_SUCCESS;
+  }
 
   // Check whether device/host sync is needed
   CeedCallBackend(CeedVectorNeedSync_Hip(vec, mem_type, &need_sync));
@@ -158,6 +166,10 @@ static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, Cee
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
   switch (mem_type) {
     case CEED_MEM_HOST:
       *has_borrowed_array_of_type = impl->h_array_borrowed;
@@ -202,6 +214,43 @@ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Set array with unified memory
+//------------------------------------------------------------------------------
+static int CeedVectorSetArrayUnifiedHostToDevice_Hip(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
+  CeedSize        length;
+  Ceed            ceed;
+  CeedVector_Hip *impl;
+
+  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  switch (copy_mode) {
+    case CEED_COPY_VALUES:
+    case CEED_OWN_POINTER:
+      if (!impl->d_array) {
+        if (impl->d_array_borrowed) {
+          impl->d_array = impl->d_array_borrowed;
+        } else {
+          if (!impl->d_array_owned) CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, sizeof(CeedScalar) * length));
+          impl->d_array = impl->d_array_owned;
+        }
+      }
+      if (array) CeedCallHip(ceed, hipMemcpy(impl->d_array, array, sizeof(CeedScalar) * length, hipMemcpyHostToDevice));
+      if (copy_mode == CEED_OWN_POINTER) CeedCallBackend(CeedFree(&array));
+      break;
+    case CEED_USE_POINTER:
+      CeedCallHip(ceed, hipFree(impl->d_array_owned));
+      CeedCallBackend(CeedFree(&impl->h_array_owned));
+      impl->h_array_owned    = NULL;
+      impl->h_array_borrowed = array;
+      impl->d_array          = impl->h_array_borrowed;
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Set the array used by a vector,
 //   freeing any previously allocated array if applicable
@@ -213,7 +262,11 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_ty
   CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec));
   switch (mem_type) {
     case CEED_MEM_HOST:
-      return CeedVectorSetArrayHost_Hip(vec, copy_mode, array);
+      if (impl->has_unified_addressing) {
+        return CeedVectorSetArrayUnifiedHostToDevice_Hip(vec, copy_mode, array);
+      } else {
+        return CeedVectorSetArrayHost_Hip(vec, copy_mode, array);
+      }
     case CEED_MEM_DEVICE:
       return CeedVectorSetArrayDevice_Hip(vec, copy_mode, array);
   }
@@ -303,8 +356,10 @@ int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val)
 static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
   CeedSize        length;
   CeedVector_Hip *impl;
+  Ceed_Hip       *hip_data;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedGetData(CeedVectorReturnCeed(vec), &hip_data));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   // Set value for synced device/host array
   if (!impl->d_array && !impl->h_array) {
@@ -321,7 +376,7 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
     }
   }
   if (impl->d_array) {
-    if (val == 0) {
+    if (val == 0 && !impl->h_array_borrowed) {
       CeedCallHip(CeedVectorReturnCeed(vec), hipMemset(impl->d_array, 0, length * sizeof(CeedScalar)));
     } else {
       CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val));
@@ -398,14 +453,17 @@ static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, CeedSca
 }
 
 //------------------------------------------------------------------------------
-// Core logic for array syncronization for GetArray.
+// Core logic for array synchronization for GetArray.
 //   If a different memory type is most up to date, this will perform a copy
 //------------------------------------------------------------------------------
-static int CeedVectorGetArrayCore_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
+static int CeedVectorGetArrayCore_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
   // Sync array to requested mem_type
   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
 
@@ -431,15 +489,21 @@ static int CeedVectorGetArrayRead_Hip(const CeedVector vec, const CeedMemType me
 //------------------------------------------------------------------------------
 // Get read/write access to a vector via the specified mem_type
 //------------------------------------------------------------------------------
-static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
+static int CeedVectorGetArray_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
+  // 'Get' array and set only 'get'ed array as valid
   CeedCallBackend(CeedVectorGetArrayCore_Hip(vec, mem_type, array));
   CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec));
   switch (mem_type) {
     case CEED_MEM_HOST:
       impl->h_array = *array;
+      if (impl->has_unified_addressing) impl->d_array = *array;
       break;
     case CEED_MEM_DEVICE:
       impl->d_array = *array;
@@ -451,11 +515,17 @@ static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_ty
 //------------------------------------------------------------------------------
 // Get write access to a vector via the specified mem_type
 //------------------------------------------------------------------------------
-static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
+static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   bool            has_array_of_type = true;
   CeedVector_Hip *impl;
+  Ceed_Hip       *hip_data;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedGetData(CeedVectorReturnCeed(vec), &hip_data));
+
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
   CeedCallBackend(CeedVectorHasArrayOfType_Hip(vec, mem_type, &has_array_of_type));
   if (!has_array_of_type) {
     // Allocate if array is not yet allocated
@@ -487,8 +557,10 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
   const CeedScalar *d_array;
   CeedVector_Hip   *impl;
   hipblasHandle_t   handle;
+  Ceed_Hip         *hip_data;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &hip_data));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
@@ -518,7 +590,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
         CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
         CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
-        CeedCallHipblas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
         *norm += sub_norm;
       }
 #endif /* HIP_VERSION */
@@ -545,7 +617,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 #if defined(CEED_SCALAR_IS_FP32)
 #if (HIP_VERSION >= 60000000)
       CeedCallHipblas(ceed, hipblasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
-#else  /* CUDA_VERSION */
+#else  /* HIP_VERSION */
       float  sub_norm = 0.0, norm_sum = 0.0;
       float *d_array_start;
 
@@ -562,7 +634,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 #else  /* CEED_SCALAR */
 #if (HIP_VERSION >= 60000000)
       CeedCallHipblas(ceed, hipblasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
-#else  /* CUDA_VERSION */
+#else  /* HIP_VERSION */
       double  sub_norm = 0.0, norm_sum = 0.0;
       double *d_array_start;
 
@@ -599,7 +671,12 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
         CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
         CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
-        CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+        if (hip_data->has_unified_addressing) {
+          CeedCallHip(ceed, hipDeviceSynchronize());
+          sub_max = fabs(d_array[index - 1]);
+        } else {
+          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+        }
         if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
       *norm = current_max;
@@ -610,7 +687,12 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
       CeedScalar norm_no_abs;
 
       CeedCallHipblas(ceed, hipblasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
-      CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+      if (hip_data->has_unified_addressing) {
+        CeedCallHip(ceed, hipDeviceSynchronize());
+        norm_no_abs = fabs(d_array[index - 1]);
+      } else {
+        CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+      }
       *norm = fabs(norm_no_abs);
 #else  /* HIP_VERSION */
       CeedInt index;
@@ -623,7 +705,12 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
         CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
         CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
-        CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+        if (hip_data->has_unified_addressing) {
+          CeedCallHip(ceed, hipDeviceSynchronize());
+          sub_max = fabs(d_array[index - 1]);
+        } else {
+          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+        }
         if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
       *norm = current_max;
@@ -854,6 +941,7 @@ static int CeedVectorDestroy_Hip(const CeedVector vec) {
 //------------------------------------------------------------------------------
 int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) {
   CeedVector_Hip *impl;
+  Ceed_Hip       *hip_impl;
   Ceed            ceed;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
@@ -875,8 +963,10 @@ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Hip));
-  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedGetData(ceed, &hip_impl));
+  CeedCallBackend(CeedDestroy(&ceed));
+  impl->has_unified_addressing = hip_impl->has_unified_addressing;
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 9209e88049..993e4601fd 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -17,6 +17,7 @@
 #endif
 
 typedef struct {
+  int         has_unified_addressing;
   CeedScalar *h_array;
   CeedScalar *h_array_borrowed;
   CeedScalar *h_array_owned;
diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c
index 837cbdc869..0b98004346 100644
--- a/backends/hip/ceed-hip-common.c
+++ b/backends/hip/ceed-hip-common.c
@@ -19,7 +19,8 @@ int CeedInit_Hip(Ceed ceed, const char *resource) {
   Ceed_Hip   *data;
   const char *device_spec = strstr(resource, ":device_id=");
   const int   device_id   = (device_spec) ? atoi(device_spec + 11) : -1;
-  int         current_device_id;
+  int         current_device_id, xnack_value;
+  const char *xnack;
 
   CeedCallHip(ceed, hipGetDevice(&current_device_id));
   if (device_id >= 0 && current_device_id != device_id) {
@@ -30,6 +31,12 @@ int CeedInit_Hip(Ceed ceed, const char *resource) {
   CeedCallBackend(CeedGetData(ceed, &data));
   data->device_id = current_device_id;
   CeedCallHip(ceed, hipGetDeviceProperties(&data->device_prop, current_device_id));
+  xnack                        = getenv("HSA_XNACK");
+  xnack_value                  = !!xnack ? atol(xnack) : 0;
+  data->has_unified_addressing = xnack_value > 0 ? data->device_prop.unifiedAddressing : 0;
+  if (data->has_unified_addressing) {
+    CeedDebug(ceed, "Using unified memory addressing");
+  }
   data->opt_block_size = 256;
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip/ceed-hip-common.h b/backends/hip/ceed-hip-common.h
index 28805b944b..23fdaacffe 100644
--- a/backends/hip/ceed-hip-common.h
+++ b/backends/hip/ceed-hip-common.h
@@ -72,6 +72,7 @@ typedef struct {
   hipblasHandle_t        hipblas_handle;
   struct hipDeviceProp_t device_prop;
   int                    opt_block_size;
+  int                    has_unified_addressing;
 } Ceed_Hip;
 
 CEED_INTERN int CeedInit_Hip(Ceed ceed, const char *resource);

From 4bccaee3c3d3cf53339de550538ec4313ba28889 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 27 Mar 2025 15:04:24 -0400
Subject: [PATCH 372/571] fix(fluids): Correct bc for change in
 PetscOptionsObject

Changed from struct to pointer-to-struct
---
 examples/fluids/include/bc_definition.h | 2 +-
 examples/fluids/src/bc_definition.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/fluids/include/bc_definition.h b/examples/fluids/include/bc_definition.h
index 095c3884b1..c43f68f5cd 100644
--- a/examples/fluids/include/bc_definition.h
+++ b/examples/fluids/include/bc_definition.h
@@ -34,7 +34,7 @@ struct _p_BCDefinition {
 **/
 #define PetscOptionsBCDefinition(opt, text, man, name, bc_def, set) \
   PetscOptionsBCDefinition_Private(PetscOptionsObject, opt, text, man, name, bc_def, set)
-PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems *PetscOptionsObject, const char opt[], const char text[], const char man[],
+PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems PetscOptionsObject, const char opt[], const char text[], const char man[],
                                                 const char name[], BCDefinition *bc_def, PetscBool *set);
 
 PetscErrorCode BCDefinitionCreate(const char *name, PetscInt num_label_values, PetscInt label_values[], BCDefinition *bc_def);
diff --git a/examples/fluids/src/bc_definition.c b/examples/fluids/src/bc_definition.c
index 03bb6e3569..7b917ae642 100644
--- a/examples/fluids/src/bc_definition.c
+++ b/examples/fluids/src/bc_definition.c
@@ -91,7 +91,7 @@ PetscErrorCode BCDefinitionGetEssential(BCDefinition bc_def, PetscInt *num_essen
 #define LABEL_ARRAY_SIZE 256
 
 // @brief See `PetscOptionsBCDefinition`
-PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems *PetscOptionsObject, const char opt[], const char text[], const char man[],
+PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems PetscOptionsObject, const char opt[], const char text[], const char man[],
                                                 const char name[], BCDefinition *bc_def, PetscBool *set) {
   PetscInt num_label_values = LABEL_ARRAY_SIZE, label_values[LABEL_ARRAY_SIZE] = {0};
 

From 9a2e771e266eaf0d22181b9f7f12731e40739a7a Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 27 Mar 2025 15:04:56 -0400
Subject: [PATCH 373/571] fix(fluids): Address unintialized compiler warnings

---
 examples/fluids/qfunctions/stg_shur14.h | 4 ++--
 examples/fluids/src/misc.c              | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index 1394ac53e4..edf74bbb6f 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -310,7 +310,7 @@ CEED_QFUNCTION(ICsStg)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSc
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
     }
 
-    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5];
+    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.};
     State      s    = StateFromY(gas, Y);
     StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) {
@@ -508,7 +508,7 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar *
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
     }
 
-    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5];
+    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.};
     State      s    = StateFromY(gas, Y);
     StateToQ(gas, s, q, gas->state_var);
     switch (gas->state_var) {
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 8b54f7c90b..3f912a1b11 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -95,6 +95,7 @@ PetscErrorCode DMPlexInsertBoundaryValues_FromICs(DM dm, PetscBool insert_essent
 
 static PetscErrorCode BinaryReadIntoInt(PetscViewer viewer, PetscInt *out, PetscDataType file_type) {
   PetscFunctionBeginUser;
+  *out = -13;  // appease the overzealous GCC compiler warning Gods
   if (file_type == PETSC_INT32) {
     PetscInt32 val;
     PetscCall(PetscViewerBinaryRead(viewer, &val, 1, NULL, PETSC_INT32));

From 5d4f3a240e49215b0094ec3aa221cba8f1f56bde Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Thu, 27 Mar 2025 15:22:15 -0400
Subject: [PATCH 374/571] style: clang-format 19 changes

---
 examples/fluids/qfunctions/stg_shur14.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index edf74bbb6f..b62bbda326 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -311,7 +311,7 @@ CEED_QFUNCTION(ICsStg)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSc
     }
 
     CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.};
-    State      s    = StateFromY(gas, Y);
+    State      s = StateFromY(gas, Y);
     StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) {
       q0[j][i] = q[j];
@@ -509,7 +509,7 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar *
     }
 
     CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.};
-    State      s    = StateFromY(gas, Y);
+    State      s = StateFromY(gas, Y);
     StateToQ(gas, s, q, gas->state_var);
     switch (gas->state_var) {
       case STATEVAR_CONSERVATIVE:

From bfbde1a268fe98bfe2ba1c9d50ebdf542389ce5e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 31 Mar 2025 10:48:08 -0600
Subject: [PATCH 375/571] petsc - require 3.23

---
 examples/fluids/navierstokes.h        | 4 ++--
 examples/petsc/include/petscversion.h | 4 ++--
 examples/solids/elasticity.h          | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 4f93e64a71..a5c2b9fc30 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -17,8 +17,8 @@
 #include "./include/petsc_ops.h"
 #include "qfunctions/newtonian_types.h"
 
-#if PETSC_VERSION_LT(3, 22, 0)
-#error "PETSc v3.22 or later is required"
+#if PETSC_VERSION_LT(3, 23, 0)
+#error "PETSc v3.23 or later is required"
 #endif
 
 // -----------------------------------------------------------------------------
diff --git a/examples/petsc/include/petscversion.h b/examples/petsc/include/petscversion.h
index 4ef951a893..50d333c33e 100644
--- a/examples/petsc/include/petscversion.h
+++ b/examples/petsc/include/petscversion.h
@@ -9,6 +9,6 @@
 /// Petsc version check
 #pragma once
 
-#if PETSC_VERSION_LT(3, 22, 0)
-#error "PETSc v3.22 or later is required"
+#if PETSC_VERSION_LT(3, 23, 0)
+#error "PETSc v3.23 or later is required"
 #endif
diff --git a/examples/solids/elasticity.h b/examples/solids/elasticity.h
index 3e964cc7c4..155a022136 100644
--- a/examples/solids/elasticity.h
+++ b/examples/solids/elasticity.h
@@ -21,6 +21,6 @@
 #include "include/utils.h"
 #include "problems/problems.h"
 
-#if PETSC_VERSION_LT(3, 22, 0)
-#error "PETSc v3.22 or later is required"
+#if PETSC_VERSION_LT(3, 23, 0)
+#error "PETSc v3.23 or later is required"
 #endif

From 3b9caef58af0b75e45cbbaf9089919028a809fac Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 7 Apr 2025 11:59:35 -0600
Subject: [PATCH 376/571] tidy - minor fixes

---
 Makefile                   |  2 +-
 interface/ceed-config.c    | 14 +++++++-------
 interface/ceed-jit-tools.c |  8 +++++++-
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index c0e96c3553..483c57f2ad 100644
--- a/Makefile
+++ b/Makefile
@@ -957,7 +957,7 @@ vermin    :
 CLANG_TIDY ?= clang-tidy
 
 %.c.tidy : %.c
-	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c99 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
+	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c99 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\"" -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\""
 
 %.cpp.tidy : %.cpp
 	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(OCCA_DIR)/include -I$(ROCM_DIR)/include
diff --git a/interface/ceed-config.c b/interface/ceed-config.c
index 114f67d81e..bb7aaa9258 100644
--- a/interface/ceed-config.c
+++ b/interface/ceed-config.c
@@ -14,9 +14,9 @@ const char *CeedBuildConfiguration = CEED_BUILD_CONFIGURATION;
 /// @{
 
 /**
-  @brief Get output of `git describe --dirty` from build time
+  @brief Get output of `git describe --dirty` from build time.
 
-  While @ref CeedGetVersion() uniqely identifies the source code for release
+  While @ref CeedGetVersion() uniquely identifies the source code for release
   builds, it does not identify builds from other commits.
 
   @param[out] git_version A static string containing the Git commit description.
@@ -24,11 +24,11 @@ const char *CeedBuildConfiguration = CEED_BUILD_CONFIGURATION;
   If `git describe --always --dirty` fails, the string `"unknown"` will be provided.
   This could occur if Git is not installed or if libCEED is not being built from a repository, for example.`
 
-  @return An error code: 0 - success, otherwise - failure
-
   @ref Developer
 
   @sa CeedGetVersion() CeedGetBuildConfiguration()
+
+  @return An error code: 0 - success, otherwise - failure
 */
 int CeedGetGitVersion(const char **git_version) {
   *git_version = CeedGitVersion;
@@ -36,17 +36,17 @@ int CeedGetGitVersion(const char **git_version) {
 }
 
 /**
-  @brief Get build variables as a multi-line string
+  @brief Get build variables as a multi-line string.
 
   Each line of the string has the format `VARNAME = value`.
 
   @param[out] build_config A static string containing build variables
 
-  @return An error code: 0 - success, otherwise - failure
-
   @ref Developer
 
   @sa CeedGetVersion() CeedGetGitVersion()
+
+  @return An error code: 0 - success, otherwise - failure
 */
 int CeedGetBuildConfiguration(const char **build_config) {
   *build_config = CeedBuildConfiguration;
diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index 978bc41f61..16a1fa31ab 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -139,7 +139,13 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
   file_size = ftell(source_file);
   rewind(source_file);
   //  -- Allocate memory for entire source file
-  CeedCall(CeedCalloc(file_size + 1, &temp_buffer));
+  {
+    const int ierr = CeedCalloc(file_size + 1, &temp_buffer);
+
+    // Close stream before error handling, if necessary
+    if (ierr != CEED_ERROR_SUCCESS) fclose(source_file);
+    CeedCall(ierr);
+  }
   // -- Copy the file into the buffer
   if (1 != fread(temp_buffer, file_size, 1, source_file)) {
     // LCOV_EXCL_START

From 7d0543c0678f07bb414f92ad0b748b4cdb69d81d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 7 Apr 2025 12:38:27 -0600
Subject: [PATCH 377/571] minor - another small tidy fix

---
 interface/ceed-jit-tools.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index 16a1fa31ab..fdc0efb217 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -137,7 +137,7 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
   // -- Compute size of source
   fseek(source_file, 0L, SEEK_END);
   file_size = ftell(source_file);
-  rewind(source_file);
+  fseek(source_file, 0L, SEEK_SET);
   //  -- Allocate memory for entire source file
   {
     const int ierr = CeedCalloc(file_size + 1, &temp_buffer);

From ed1ebff7abaa668c9b274382abc627b7a8754a20 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Apr 2025 10:04:36 -0600
Subject: [PATCH 378/571] actions - use latest gcc

---
 .github/workflows/c-fortan-test-ppc64le.yml    | 4 ++--
 .github/workflows/c-fortran-test-arm64.yml     | 4 ++--
 .github/workflows/c-fortran-test-linux-osx.yml | 4 ++--
 .github/workflows/c-fortran-test-style.yml     | 2 +-
 .github/workflows/python-test-with-style.yml   | 8 ++++----
 .github/workflows/rust-test-with-style.yml     | 2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/c-fortan-test-ppc64le.yml b/.github/workflows/c-fortan-test-ppc64le.yml
index b959d929d5..f710c9ba12 100644
--- a/.github/workflows/c-fortan-test-ppc64le.yml
+++ b/.github/workflows/c-fortan-test-ppc64le.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-24.04]
-        compiler: [gcc-13]
+        compiler: [gcc]
         arch: [ppc64le]
         distro: [ubuntu22.04]
 
@@ -24,7 +24,7 @@ jobs:
       uses: uraimo/run-on-arch-action@v3
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       id: runcmd
       with:
         arch: ${{ matrix.arch }}
diff --git a/.github/workflows/c-fortran-test-arm64.yml b/.github/workflows/c-fortran-test-arm64.yml
index d75d11512a..6927f37b68 100644
--- a/.github/workflows/c-fortran-test-arm64.yml
+++ b/.github/workflows/c-fortran-test-arm64.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-24.04-arm]
-        compiler: [gcc-13, clang]
+        compiler: [gcc, clang]
 
     runs-on: ${{ matrix.os }}
 
@@ -21,7 +21,7 @@ jobs:
     - name: Build and test libCEED
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       run: |
         make info
         make -j
diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index 1705f42936..e732fcc9a1 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-24.04, macos-15]
-        compiler: [gcc-13, clang]
+        compiler: [gcc, clang]
 
     runs-on: ${{ matrix.os }}
 
@@ -21,7 +21,7 @@ jobs:
     - name: Build and test libCEED
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran-14
       run: |
         make info
         make -j
diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml
index 7ee4f7f2d6..ff55101bde 100644
--- a/.github/workflows/c-fortran-test-style.yml
+++ b/.github/workflows/c-fortran-test-style.yml
@@ -26,7 +26,7 @@ jobs:
     - name: C style
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-11
+        FC: gfortran
       run: |
         make info
         make format-c -j CLANG_FORMAT=clang-format-19 && git diff --exit-code
diff --git a/.github/workflows/python-test-with-style.yml b/.github/workflows/python-test-with-style.yml
index 57112d1e3f..ef87f09090 100644
--- a/.github/workflows/python-test-with-style.yml
+++ b/.github/workflows/python-test-with-style.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-24.04]
-        compiler: [gcc-13]
+        compiler: [gcc]
         python-version: ['3.x']
 
     runs-on: ${{ matrix.os }}
@@ -30,7 +30,7 @@ jobs:
     - name: Python test
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       run: |
         make info
         make -j2
@@ -42,12 +42,12 @@ jobs:
     - name: Python style
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       run: |
         make format-py && git diff --exit-code
     - name: Python version
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       run: |
         make vermin
diff --git a/.github/workflows/rust-test-with-style.yml b/.github/workflows/rust-test-with-style.yml
index d6e1f2a42b..f8e9499fe2 100644
--- a/.github/workflows/rust-test-with-style.yml
+++ b/.github/workflows/rust-test-with-style.yml
@@ -31,7 +31,7 @@ jobs:
     - name: Rust test with coverage
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-11
+        FC: gfortran
       run: cargo llvm-cov test --doctests --lcov --output-path lcov.info
     - name: Codecov upload
       uses: codecov/codecov-action@v4

From 8c76f87758827c6c8e0f6fb994e85a37c1b5ce99 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Thu, 17 Apr 2025 21:20:52 -0700
Subject: [PATCH 379/571] Fix edge case in offset padding for empty cells

---
 backends/cuda-ref/ceed-cuda-ref-restriction.c | 4 +++-
 backends/hip-ref/ceed-hip-ref-restriction.c   | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index bde12c38cd..30e2ee1623 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -550,16 +550,18 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
     CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points = offsets[i + 1] - offsets[i];
+      CeedInt last_point = offsets[offsets[i]] * num_comp;
 
       points_per_elem[i] = num_points;
       at_points_size += num_points;
       // -- Copy all points in element
       for (CeedInt j = 0; j < num_points; j++) {
         offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp;
+        last_point                         = offsets_padded[i * max_points + j];
       }
       // -- Replicate out last point in element
       for (CeedInt j = num_points; j < max_points; j++) {
-        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1] * num_comp;
+        offsets_padded[i * max_points + j] = last_point;
       }
     }
     CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 543839a2bf..54d8b13ea0 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -551,16 +551,18 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
     CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points = offsets[i + 1] - offsets[i];
+      CeedInt last_point = offsets[offsets[i]] * num_comp;
 
       points_per_elem[i] = num_points;
       at_points_size += num_points;
       // -- Copy all points in element
       for (CeedInt j = 0; j < num_points; j++) {
         offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp;
+        last_point                         = offsets_padded[i * max_points + j];
       }
       // -- Replicate out last point in element
       for (CeedInt j = num_points; j < max_points; j++) {
-        offsets_padded[i * max_points + j] = offsets[offsets[i] + num_points - 1] * num_comp;
+        offsets_padded[i * max_points + j] = last_point;
       }
     }
     CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,

From 48fdef17394b48363460b50625ac6d4bb530a981 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 21 Apr 2025 13:28:42 -0600
Subject: [PATCH 380/571] op - minor at points diagonal improvements

---
 backends/ref/ceed-ref-operator.c | 96 +++++++++++---------------------
 1 file changed, 34 insertions(+), 62 deletions(-)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 151316f83d..9adc6e4c3c 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -886,8 +886,8 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_points_offset, CeedInt num_points, CeedQFunctionField *qf_input_fields,
                                                      CeedOperatorField *op_input_fields, CeedInt num_input_fields, CeedVector in_vec,
-                                                     CeedVector point_coords_elem, bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                                     CeedOperator_Ref *impl, CeedRequest *request) {
+                                                     CeedVector point_coords_elem, bool skip_active, bool skip_passive,
+                                                     CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
     bool                is_active;
     CeedInt             elem_size, size, num_comp;
@@ -902,6 +902,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     is_active = vec == CEED_VECTOR_ACTIVE;
     CeedCallBackend(CeedVectorDestroy(&vec));
     if (skip_active && is_active) continue;
+    if (skip_passive && !is_active) continue;
 
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -909,7 +910,8 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Restrict block active input
-    if (is_active && !impl->skip_rstr_in[i]) {
+    // When skipping passive inputs, we're doing assembly and should not restrict
+    if (is_active && !impl->skip_rstr_in[i] && !skip_passive) {
       if (rstr_type == CEED_RESTRICTION_POINTS) {
         CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request));
       } else {
@@ -952,7 +954,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
 static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_points_offset, CeedInt num_points, CeedQFunctionField *qf_output_fields,
                                                       CeedOperatorField *op_output_fields, CeedInt num_input_fields, CeedInt num_output_fields,
                                                       bool *apply_add_basis, bool *skip_rstr, CeedOperator op, CeedVector out_vec,
-                                                      CeedVector point_coords_elem, CeedOperator_Ref *impl, CeedRequest *request) {
+                                                      CeedVector point_coords_elem, bool skip_passive, CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     bool                is_active;
     CeedRestrictionType rstr_type;
@@ -961,6 +963,12 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
+    // Skip active input
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (skip_passive && !is_active) continue;
+
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
@@ -989,7 +997,8 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
       }
     }
     // Restrict output block
-    if (skip_rstr[i]) {
+    // When skipping passive outputs, we're doing assembly and should not restrict
+    if (skip_rstr[i] || skip_passive) {
       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       continue;
     }
@@ -997,7 +1006,6 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
     // Get output vector
     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    is_active = vec == CEED_VECTOR_ACTIVE;
     if (is_active) vec = out_vec;
     // Restrict
     if (rstr_type == CEED_RESTRICTION_POINTS) {
@@ -1049,7 +1057,7 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
 
     // Input basis apply
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
-                                                       impl->point_coords_elem, false, e_data, impl, request));
+                                                       impl->point_coords_elem, false, false, e_data, impl, request));
 
     // Q function
     if (!impl->is_identity_qf) {
@@ -1059,7 +1067,7 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
     // Output basis apply and restriction
     CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
                                                         num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
-                                                        impl->point_coords_elem, impl, request));
+                                                        impl->point_coords_elem, false, impl, request));
 
     num_points_offset += num_points;
   }
@@ -1202,7 +1210,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
     // Input basis apply
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, NULL,
-                                                       impl->point_coords_elem, true, e_data_full, impl, request));
+                                                       impl->point_coords_elem, true, false, e_data_full, impl, request));
 
     // Assemble QFunction
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1360,7 +1368,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
     CeedCallBackend(CeedVectorSetValue(out_vec, 0.0));
   }
 
-  // Clear input Qvecs
+  // Clear input Evecs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     bool       is_active;
     CeedVector vec;
@@ -1368,8 +1376,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     is_active = vec == CEED_VECTOR_ACTIVE;
     CeedCallBackend(CeedVectorDestroy(&vec));
-    if (!is_active) continue;
-    CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+    if (!is_active || impl->skip_rstr_in[i]) continue;
+    CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
   }
 
   // Input Evecs and Restriction
@@ -1385,7 +1393,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
     // Input basis apply for non-active bases
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
-                                                       impl->point_coords_elem, true, e_data, impl, request));
+                                                       impl->point_coords_elem, true, false, e_data, impl, request));
 
     // Loop over points on element
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1399,7 +1407,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       is_active = vec == CEED_VECTOR_ACTIVE;
       CeedCallBackend(CeedVectorDestroy(&vec));
-      if (!is_active) continue;
+      if (!is_active || impl->skip_rstr_in[i]) continue;
 
       // -- Get active restriction type
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -1412,37 +1420,18 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 
       e_vec_size = elem_size_active * num_comp_active;
       for (CeedInt s = 0; s < e_vec_size; s++) {
-        CeedEvalMode eval_mode;
-        CeedBasis    basis;
-
         // -- Update unit vector
         {
           CeedScalar *array;
 
-          if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
           CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
           array[s] = 1.0;
           if (s > 0) array[s - 1] = 0.0;
           CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
         }
-        // -- Basis action
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-        switch (eval_mode) {
-          case CEED_EVAL_NONE:
-            break;
-          // Note - these basis eval modes require FEM fields
-          case CEED_EVAL_INTERP:
-          case CEED_EVAL_GRAD:
-          case CEED_EVAL_DIV:
-          case CEED_EVAL_CURL:
-            CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, impl->e_vecs_in[i],
-                                                   impl->q_vecs_in[i]));
-            CeedCallBackend(CeedBasisDestroy(&basis));
-            break;
-          case CEED_EVAL_WEIGHT:
-            break;  // No action
-        }
+        // Input basis apply for active bases
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields,
+                                                           in_vec, impl->point_coords_elem, false, true, e_data, impl, request));
 
         // -- Q function
         if (!impl->is_identity_qf) {
@@ -1452,23 +1441,21 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
         // -- Output basis apply and restriction
         CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
                                                             num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
-                                                            impl->point_coords_elem, impl, request));
+                                                            impl->point_coords_elem, true, impl, request));
 
         // -- Grab diagonal value
         for (CeedInt j = 0; j < num_output_fields; j++) {
           bool                is_active;
           CeedInt             elem_size = 0;
           CeedRestrictionType rstr_type;
-          CeedEvalMode        eval_mode;
           CeedVector          vec;
           CeedElemRestriction elem_rstr;
-          CeedBasis           basis;
 
           // ---- Skip non-active output
           CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
           is_active = vec == CEED_VECTOR_ACTIVE;
           CeedCallBackend(CeedVectorDestroy(&vec));
-          if (!is_active) continue;
+          if (!is_active || impl->skip_rstr_out[j]) continue;
 
           // ---- Check if elem size matches
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
@@ -1491,27 +1478,6 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
               continue;
             }
           }
-
-          // ---- Basis action
-          CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
-          switch (eval_mode) {
-            case CEED_EVAL_NONE:
-              break;  // No action
-            case CEED_EVAL_INTERP:
-            case CEED_EVAL_GRAD:
-            case CEED_EVAL_DIV:
-            case CEED_EVAL_CURL:
-              CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
-              CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, impl->q_vecs_out[j],
-                                                     impl->e_vecs_out[j]));
-              CeedCallBackend(CeedBasisDestroy(&basis));
-              break;
-            // LCOV_EXCL_START
-            case CEED_EVAL_WEIGHT: {
-              return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-              // LCOV_EXCL_STOP
-            }
-          }
           // ---- Update output vector
           {
             CeedScalar *array, current_value = 0.0;
@@ -1533,7 +1499,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
         }
         // -- Reset unit vector
-        if (s == e_vec_size - 1) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+        if (s == e_vec_size - 1) {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
       }
     }
     num_points_offset += num_points;

From 8de854460df9d2de27a2c7bd7936d1f408d56ed7 Mon Sep 17 00:00:00 2001
From: Hugh Carson <hughcars@amazon.com>
Date: Tue, 22 Apr 2025 16:04:57 +0000
Subject: [PATCH 381/571] Fix CC_VENDOR for Ubuntu wrapped gcc

---
 Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 483c57f2ad..88f1911f2a 100644
--- a/Makefile
+++ b/Makefile
@@ -107,8 +107,9 @@ V ?= $(VERBOSE)
 AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
 
 # Note: Intel oneAPI C/C++ compiler is now icx/icpx
-CC_VENDOR := $(firstword $(filter gcc (GCC) clang icc icc_orig oneAPI XL emcc,$(subst -, ,$(shell $(CC) --version))))
+CC_VENDOR := $(firstword $(filter gcc (GCC) clang cc icc icc_orig oneAPI XL emcc,$(subst -, ,$(shell $(CC) --version))))
 CC_VENDOR := $(subst (GCC),gcc,$(subst icc_orig,icc,$(CC_VENDOR)))
+CC_VENDOR := $(if $(filter cc,$(CC_VENDOR)),gcc,$(CC_VENDOR))
 FC_VENDOR := $(if $(FC),$(firstword $(filter GNU ifort ifx XL,$(shell $(FC) --version 2>&1 || $(FC) -qversion))))
 
 # Default extra flags by vendor

From 5cde1db9152a67762c6859bebaeefa844dfdd9a0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 22 Apr 2025 16:57:05 -0600
Subject: [PATCH 382/571] op - gpu minor at points diagonal improvements

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 95 ++++++++-------------
 backends/hip-ref/ceed-hip-ref-operator.c   | 97 +++++++++-------------
 2 files changed, 74 insertions(+), 118 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 808816986f..025d5b6501 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -750,7 +750,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                                       CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt *num_points,
-                                                      const bool skip_active, CeedOperator_Cuda *impl) {
+                                                      const bool skip_active, const bool skip_passive, CeedOperator_Cuda *impl) {
   bool         is_active = false;
   CeedEvalMode eval_mode;
   CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
@@ -758,7 +758,11 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input
   // Skip active input
   CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
   is_active = l_vec == CEED_VECTOR_ACTIVE;
-  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (skip_active && is_active) return CEED_ERROR_SUCCESS;
+  if (skip_passive && !is_active) {
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    return CEED_ERROR_SUCCESS;
+  }
   if (is_active) {
     l_vec = in_vec;
     if (!e_vec) e_vec = active_e_vec;
@@ -842,7 +846,7 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     CeedCallBackend(
         CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem,
-                                                        num_points, false, impl));
+                                                        num_points, false, false, impl));
   }
 
   // Output pointers, as necessary
@@ -1845,19 +1849,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, impl));
-  }
-
-  // Clear active input Qvecs
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool       is_active = false;
-    CeedVector l_vec;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
-    is_active = l_vec == CEED_VECTOR_ACTIVE;
-    CeedCallBackend(CeedVectorDestroy(&l_vec));
-    if (!is_active) continue;
-    CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+    CeedCallBackend(
+        CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false, impl));
   }
 
   // Output pointers, as necessary
@@ -1876,19 +1869,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   // Loop over active fields
   for (CeedInt i = 0; i < num_input_fields; i++) {
     bool                is_active = false, is_active_at_points = true;
-    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
+    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0, field_in = impl->input_field_order[i];
     CeedRestrictionType rstr_type;
     CeedVector          l_vec;
     CeedElemRestriction elem_rstr;
 
     // -- Skip non-active input
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[field_in], &l_vec));
     is_active = l_vec == CEED_VECTOR_ACTIVE;
     CeedCallBackend(CeedVectorDestroy(&l_vec));
-    if (!is_active) continue;
+    if (!is_active || impl->skip_rstr_in[field_in]) continue;
 
     // -- Get active restriction type
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[field_in], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
     if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
@@ -1897,16 +1890,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     e_vec_size = elem_size * num_comp_active;
+    CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
     for (CeedInt s = 0; s < e_vec_size; s++) {
-      bool         is_active = false;
-      CeedEvalMode eval_mode;
-      CeedVector   l_vec, q_vec = impl->q_vecs_in[i];
-
-      // Skip non-active input
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
-      is_active = l_vec == CEED_VECTOR_ACTIVE;
-      CeedCallBackend(CeedVectorDestroy(&l_vec));
-      if (!is_active) continue;
+      CeedVector q_vec = impl->q_vecs_in[field_in];
 
       // Update unit vector
       {
@@ -1915,8 +1901,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         CeedSize start = node * 1 + comp * (elem_size * num_elem);
         CeedSize stop  = (comp + 1) * (elem_size * num_elem);
 
-        if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
-        else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
+        if (s != 0) CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
 
         node = s % elem_size, comp = s / elem_size;
         start = node * 1 + comp * (elem_size * num_elem);
@@ -1925,29 +1910,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       }
 
       // Basis action
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-      switch (eval_mode) {
-        case CEED_EVAL_NONE: {
-          const CeedScalar *e_vec_array;
-
-          CeedCallBackend(CeedVectorGetArrayRead(active_e_vec_in, CEED_MEM_DEVICE, &e_vec_array));
-          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
-          break;
-        }
-        case CEED_EVAL_INTERP:
-        case CEED_EVAL_GRAD:
-        case CEED_EVAL_DIV:
-        case CEED_EVAL_CURL: {
-          CeedBasis basis;
+      for (CeedInt j = 0; j < num_input_fields; j++) {
+        CeedInt field = impl->input_field_order[j];
 
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(
-              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, active_e_vec_in, q_vec));
-          CeedCallBackend(CeedBasisDestroy(&basis));
-          break;
-        }
-        case CEED_EVAL_WEIGHT:
-          break;  // No action
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, NULL, active_e_vec_in, num_elem,
+                                                            num_points, false, true, impl));
       }
 
       // Q function
@@ -1957,20 +1924,21 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
       for (CeedInt j = 0; j < num_output_fields; j++) {
         bool                is_active = false;
         CeedInt             elem_size = 0;
+        CeedInt             field_out = impl->output_field_order[j];
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
-        CeedVector          l_vec, e_vec = impl->e_vecs_out[j], q_vec = impl->q_vecs_out[j];
+        CeedVector          l_vec, e_vec = impl->e_vecs_out[field_out], q_vec = impl->q_vecs_out[field_out];
         CeedElemRestriction elem_rstr;
 
         // ---- Skip non-active output
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field_out], &l_vec));
         is_active = l_vec == CEED_VECTOR_ACTIVE;
         CeedCallBackend(CeedVectorDestroy(&l_vec));
         if (!is_active) continue;
         if (!e_vec) e_vec = active_e_vec_out;
 
         // ---- Check if elem size matches
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field_out], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
         if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
         if (rstr_type == CEED_RESTRICTION_POINTS) {
@@ -1986,7 +1954,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         }
 
         // Basis action
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field_out], &eval_mode));
         switch (eval_mode) {
           case CEED_EVAL_NONE: {
             CeedScalar *e_vec_array;
@@ -2001,8 +1969,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
           case CEED_EVAL_CURL: {
             CeedBasis basis;
 
-            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis));
+            if (impl->apply_add_basis_out[field_out]) {
+              CeedCallBackend(
+                  CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            } else {
+              CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            }
             CeedCallBackend(CeedBasisDestroy(&basis));
             break;
           }
@@ -2014,6 +1987,10 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
         }
 
         // Mask output e-vec
+        if (impl->skip_rstr_out[field_out]) {
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          continue;
+        }
         CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
 
         // Restrict
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 2d28627a72..b9a7231247 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -748,7 +748,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
                                                      CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt *num_points,
-                                                     const bool skip_active, CeedOperator_Hip *impl) {
+                                                     const bool skip_active, const bool skip_passive, CeedOperator_Hip *impl) {
   bool         is_active = false;
   CeedEvalMode eval_mode;
   CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
@@ -756,7 +756,11 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_
   // Skip active input
   CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
   is_active = l_vec == CEED_VECTOR_ACTIVE;
-  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (skip_active && is_active) return CEED_ERROR_SUCCESS;
+  if (skip_passive && !is_active) {
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    return CEED_ERROR_SUCCESS;
+  }
   if (is_active) {
     l_vec = in_vec;
     if (!e_vec) e_vec = active_e_vec;
@@ -839,7 +843,7 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
     CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem,
-                                                       num_points, false, impl));
+                                                       num_points, false, false, impl));
   }
 
   // Output pointers, as necessary
@@ -1842,19 +1846,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
-    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, impl));
-  }
-
-  // Clear active input Qvecs
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool       is_active = false;
-    CeedVector l_vec;
-
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
-    is_active = l_vec == CEED_VECTOR_ACTIVE;
-    CeedCallBackend(CeedVectorDestroy(&l_vec));
-    if (!is_active) continue;
-    CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+    CeedCallBackend(
+        CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false, impl));
   }
 
   // Output pointers, as necessary
@@ -1873,19 +1866,19 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   // Loop over active fields
   for (CeedInt i = 0; i < num_input_fields; i++) {
     bool                is_active = false, is_active_at_points = true;
-    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0;
+    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0, field_in = impl->input_field_order[i];
     CeedRestrictionType rstr_type;
     CeedVector          l_vec;
     CeedElemRestriction elem_rstr;
 
     // -- Skip non-active input
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[field_in], &l_vec));
     is_active = l_vec == CEED_VECTOR_ACTIVE;
     CeedCallBackend(CeedVectorDestroy(&l_vec));
-    if (!is_active) continue;
+    if (!is_active || impl->skip_rstr_in[field_in]) continue;
 
     // -- Get active restriction type
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[field_in], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
     if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
@@ -1894,16 +1887,9 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     e_vec_size = elem_size * num_comp_active;
+    CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
     for (CeedInt s = 0; s < e_vec_size; s++) {
-      bool         is_active = false;
-      CeedEvalMode eval_mode;
-      CeedVector   l_vec, q_vec = impl->q_vecs_in[i];
-
-      // Skip non-active input
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
-      is_active = l_vec == CEED_VECTOR_ACTIVE;
-      CeedCallBackend(CeedVectorDestroy(&l_vec));
-      if (!is_active) continue;
+      CeedVector q_vec = impl->q_vecs_in[field_in];
 
       // Update unit vector
       {
@@ -1912,8 +1898,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         CeedSize start = node * 1 + comp * (elem_size * num_elem);
         CeedSize stop  = (comp + 1) * (elem_size * num_elem);
 
-        if (s == 0) CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
-        else CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
+        if (s != 0) CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
 
         node = s % elem_size, comp = s / elem_size;
         start = node * 1 + comp * (elem_size * num_elem);
@@ -1922,29 +1907,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       }
 
       // Basis action
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-      switch (eval_mode) {
-        case CEED_EVAL_NONE: {
-          const CeedScalar *e_vec_array;
-
-          CeedCallBackend(CeedVectorGetArrayRead(active_e_vec_in, CEED_MEM_DEVICE, &e_vec_array));
-          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
-          break;
-        }
-        case CEED_EVAL_INTERP:
-        case CEED_EVAL_GRAD:
-        case CEED_EVAL_DIV:
-        case CEED_EVAL_CURL: {
-          CeedBasis basis;
+      for (CeedInt j = 0; j < num_input_fields; j++) {
+        CeedInt field = impl->input_field_order[j];
 
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(
-              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, active_e_vec_in, q_vec));
-          CeedCallBackend(CeedBasisDestroy(&basis));
-          break;
-        }
-        case CEED_EVAL_WEIGHT:
-          break;  // No action
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, NULL, active_e_vec_in, num_elem,
+                                                           num_points, false, true, impl));
       }
 
       // Q function
@@ -1954,20 +1921,21 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
       for (CeedInt j = 0; j < num_output_fields; j++) {
         bool                is_active = false;
         CeedInt             elem_size = 0;
+        CeedInt             field_out = impl->output_field_order[j];
         CeedRestrictionType rstr_type;
         CeedEvalMode        eval_mode;
-        CeedVector          l_vec, e_vec = impl->e_vecs_out[j], q_vec = impl->q_vecs_out[j];
+        CeedVector          l_vec, e_vec = impl->e_vecs_out[field_out], q_vec = impl->q_vecs_out[field_out];
         CeedElemRestriction elem_rstr;
 
         // ---- Skip non-active output
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &l_vec));
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field_out], &l_vec));
         is_active = l_vec == CEED_VECTOR_ACTIVE;
         CeedCallBackend(CeedVectorDestroy(&l_vec));
         if (!is_active) continue;
         if (!e_vec) e_vec = active_e_vec_out;
 
         // ---- Check if elem size matches
-        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field_out], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
         if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
         if (rstr_type == CEED_RESTRICTION_POINTS) {
@@ -1983,7 +1951,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
         }
 
         // Basis action
-        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field_out], &eval_mode));
         switch (eval_mode) {
           case CEED_EVAL_NONE: {
             CeedScalar *e_vec_array;
@@ -1998,8 +1966,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
           case CEED_EVAL_CURL: {
             CeedBasis basis;
 
-            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis));
-            CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis));
+            if (impl->apply_add_basis_out[field_out]) {
+              CeedCallBackend(
+                  CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            } else {
+              CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            }
             CeedCallBackend(CeedBasisDestroy(&basis));
             break;
           }
@@ -2010,6 +1983,12 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
           }
         }
 
+        // Continue if a field that is summed into
+        if (impl->skip_rstr_out[field_out]) {
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          continue;
+        }
+
         // Mask output e-vec
         CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
 

From 1b95d8c6326a50bc83b21741e59f087f026c971e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Apr 2025 16:51:17 -0600
Subject: [PATCH 383/571] pc - CPU support for AtPoints assembly

---
 backends/ref/ceed-ref-operator.c | 207 +++++++++++++++++++++++++++++++
 interface/ceed-preconditioning.c |   6 +-
 tests/t596-operator.c            | 202 ++++++++++++++++++++++++++++++
 tests/t596-operator.h            |  29 +++++
 tests/t597-operator.c            | 203 ++++++++++++++++++++++++++++++
 tests/t597-operator.h            |  59 +++++++++
 6 files changed, 705 insertions(+), 1 deletion(-)
 create mode 100644 tests/t596-operator.c
 create mode 100644 tests/t596-operator.h
 create mode 100644 tests/t597-operator.c
 create mode 100644 tests/t597-operator.h

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 9adc6e4c3c..89fd47df76 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1524,6 +1524,212 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Assemble Operator AtPoints
+//------------------------------------------------------------------------------
+static int CeedSingleOperatorAssembleAtPoints_Ref(CeedOperator op, CeedInt offset, CeedVector values) {
+  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, num_comp_active = 1;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {0}, *assembled;
+  Ceed                ceed;
+  CeedVector          point_coords = NULL, in_vec, out_vec;
+  CeedElemRestriction rstr_points  = NULL;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Ref   *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Ref(op));
+
+  // Ceed
+  {
+    Ceed ceed_parent;
+
+    CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+    CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
+  }
+
+  // Point coordinates
+  CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+
+  // Input and output vectors
+  {
+    CeedSize input_size, output_size;
+
+    CeedCallBackend(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
+    CeedCallBackend(CeedVectorCreate(ceed, input_size, &in_vec));
+    CeedCallBackend(CeedVectorCreate(ceed, output_size, &out_vec));
+    CeedCallBackend(CeedVectorSetValue(out_vec, 0.0));
+  }
+
+  // Assembled array
+  CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_HOST, &assembled));
+
+  // Clear input Evecs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool       is_active;
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (!is_active || impl->skip_rstr_in[i]) continue;
+    CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+  }
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, CEED_REQUEST_IMMEDIATE));
+
+  // Loop through elements
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt num_points, e_vec_size = 0;
+
+    // Setup points for element
+    CeedCallBackend(
+        CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, CEED_REQUEST_IMMEDIATE));
+    CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+
+    // Input basis apply for non-active bases
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
+                                                       impl->point_coords_elem, true, false, e_data, impl, CEED_REQUEST_IMMEDIATE));
+
+    // Loop over points on element
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_active_at_points = true, is_active;
+      CeedInt             elem_size_active    = 1;
+      CeedRestrictionType rstr_type;
+      CeedVector          vec;
+      CeedElemRestriction elem_rstr;
+
+      // -- Skip non-active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active || impl->skip_rstr_in[i]) continue;
+
+      // -- Get active restriction type
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+      is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+      if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
+      else elem_size_active = num_points;
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+
+      e_vec_size = elem_size_active * num_comp_active;
+      for (CeedInt s = 0; s < e_vec_size; s++) {
+        const CeedInt comp_in = s / elem_size_active;
+        const CeedInt node_in = s % elem_size_active;
+
+        // -- Update unit vector
+        {
+          CeedScalar *array;
+
+          if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 1.0;
+          if (s > 0) array[s - 1] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
+        // Input basis apply for active bases
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields,
+                                                           in_vec, impl->point_coords_elem, false, true, e_data, impl, CEED_REQUEST_IMMEDIATE));
+
+        // -- Q function
+        if (!impl->is_identity_qf) {
+          CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
+        }
+
+        // -- Output basis apply and restriction
+        CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
+                                                            num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
+                                                            impl->point_coords_elem, true, impl, CEED_REQUEST_IMMEDIATE));
+
+        // -- Build element matrix
+        for (CeedInt j = 0; j < num_output_fields; j++) {
+          bool                is_active;
+          CeedInt             elem_size = 0;
+          CeedRestrictionType rstr_type;
+          CeedVector          vec;
+          CeedElemRestriction elem_rstr;
+
+          // ---- Skip non-active output
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+          is_active = vec == CEED_VECTOR_ACTIVE;
+          CeedCallBackend(CeedVectorDestroy(&vec));
+          if (!is_active || impl->skip_rstr_out[j]) continue;
+
+          // ---- Check if elem size matches
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+          if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+            continue;
+          }
+          if (rstr_type == CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, e, &elem_size));
+          } else {
+            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+          }
+          {
+            CeedInt num_comp = 0;
+
+            CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+            CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+            if (e_vec_size != num_comp * elem_size) continue;
+          }
+          // ---- Copy output
+          {
+            const CeedScalar *output;
+
+            CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_out[j], CEED_MEM_HOST, &output));
+            for (CeedInt k = 0; k < e_vec_size; k++) {
+              const CeedInt comp_out = k / elem_size_active;
+              const CeedInt node_out = k % elem_size_active;
+
+              assembled[offset + e * e_vec_size * e_vec_size + (comp_in * num_comp_active + comp_out) * elem_size_active * elem_size_active +
+                        node_out * elem_size_active + node_in] = output[k];
+            }
+            CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_out[j], &output));
+          }
+        }
+        // -- Reset unit vector
+        if (s == e_vec_size - 1) {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
+      }
+    }
+    num_points_offset += num_points;
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+
+  // Restore assembled values
+  CeedCallBackend(CeedVectorRestoreArray(values, &assembled));
+
+  // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedVectorDestroy(&in_vec));
+  CeedCallBackend(CeedVectorDestroy(&out_vec));
+  CeedCallBackend(CeedVectorDestroy(&point_coords));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Operator Destroy
 //------------------------------------------------------------------------------
@@ -1592,6 +1798,7 @@ int CeedOperatorCreateAtPoints_Ref(CeedOperator op) {
   CeedCallBackend(
       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 970f3549b5..5de71fed38 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -567,7 +567,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   @ref Developer
 **/
 static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) {
-  bool is_composite;
+  bool is_composite, is_at_points;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
@@ -595,6 +595,10 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
     }
   }
 
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(!is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+            "Backend does not implement CeedOperatorLinearAssemble for AtPoints operator");
+
   // Assemble QFunction
   CeedInt             layout_qf[3];
   const CeedScalar   *assembled_qf_array;
diff --git a/tests/t596-operator.c b/tests/t596-operator.c
new file mode 100644
index 0000000000..81ca865ebd
--- /dev/null
+++ b/tests/t596-operator.c
@@ -0,0 +1,202 @@
+/// @file
+/// Test full assembly of mass matrix operator
+/// \test Test full assembly of mass matrix operator AtPoints
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "t596-operator.h"
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) {
+    CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data;
+    CeedBasis           basis_x, basis_u;
+    CeedQFunction       qf_setup, qf_mass;
+    CeedOperator        op_setup, op_mass;
+    CeedVector          q_data, x, x_points, u, v;
+    CeedInt             p = 3, q = 4, dim = 2;
+    CeedInt             n_x = 3, n_y = 2;
+    CeedInt             num_elem = n_x * n_y;
+    CeedInt             num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+    CeedInt             ind_x[num_elem * p * p];
+    CeedScalar          assembled_values[num_comp * num_comp * num_dofs * num_dofs];
+    CeedScalar          assembled_true[num_comp * num_comp * num_dofs * num_dofs];
+
+    // Points
+    CeedVectorCreate(ceed, dim * num_points, &x_points);
+    {
+      CeedScalar x_array[dim * num_points];
+
+      for (CeedInt e = 0; e < num_elem; e++) {
+        for (CeedInt d = 0; d < dim; d++) {
+          x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+        }
+      }
+      CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    {
+      CeedInt ind_x[num_elem + 1 + num_points];
+
+      for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+      for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                        &elem_restriction_x_points);
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+    }
+
+    // Vectors
+    CeedVectorCreate(ceed, dim * num_dofs, &x);
+    {
+      CeedScalar x_array[dim * num_dofs];
+
+      for (CeedInt i = 0; i < n_x * 2 + 1; i++) {
+        for (CeedInt j = 0; j < n_y * 2 + 1; j++) {
+          x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x);
+          x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y);
+        }
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedVectorCreate(ceed, num_comp * num_dofs, &u);
+    CeedVectorCreate(ceed, num_comp * num_dofs, &v);
+    CeedVectorCreate(ceed, num_points, &q_data);
+
+    // Restrictions
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt col, row, offset;
+      col    = i % n_x;
+      row    = i / n_x;
+      offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1);
+      for (CeedInt j = 0; j < p; j++) {
+        for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x,
+                              &elem_restriction_u);
+
+    // Bases
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x);
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u);
+
+    // QFunctions
+    CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+    CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
+
+    CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+    CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+    CeedQFunctionAddInput(qf_mass, "u", num_comp, CEED_EVAL_INTERP);
+    CeedQFunctionAddOutput(qf_mass, "v", num_comp, CEED_EVAL_INTERP);
+    {
+      CeedQFunctionContext qf_context;
+
+      CeedQFunctionContextCreate(ceed, &qf_context);
+      CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp);
+      CeedQFunctionSetContext(qf_mass, qf_context);
+      CeedQFunctionContextDestroy(&qf_context);
+    }
+
+    // Operators
+    CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+    CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+    CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+    CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass);
+    CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+    CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points);
+
+    // Apply Setup Operator
+    CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE);
+
+    // Fully assemble operator
+    CeedSize   num_entries;
+    CeedInt   *rows;
+    CeedInt   *cols;
+    CeedVector assembled;
+
+    for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) {
+      assembled_values[k] = 0.0;
+      assembled_true[k]   = 0.0;
+    }
+    CeedOperatorLinearAssembleSymbolic(op_mass, &num_entries, &rows, &cols);
+    CeedVectorCreate(ceed, num_entries, &assembled);
+    CeedOperatorLinearAssemble(op_mass, assembled);
+    {
+      const CeedScalar *assembled_array;
+
+      CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
+      for (CeedInt k = 0; k < num_entries; k++) {
+        assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k];
+      }
+      CeedVectorRestoreArrayRead(assembled, &assembled_array);
+    }
+
+    // Manually assemble operator
+    CeedVectorSetValue(u, 0.0);
+    for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+      CeedScalar       *u_array;
+      const CeedScalar *v_array;
+
+      // Set input
+      CeedVectorGetArray(u, CEED_MEM_HOST, &u_array);
+      u_array[j] = 1.0;
+      if (j) u_array[j - 1] = 0.0;
+      CeedVectorRestoreArray(u, &u_array);
+
+      // Compute entries for column j
+      CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i];
+      CeedVectorRestoreArrayRead(v, &v_array);
+    }
+
+    // Check output
+    for (CeedInt i = 0; i < num_comp * num_dofs; i++) {
+      for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+        if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > 100. * CEED_EPSILON) {
+          // LCOV_EXCL_START
+          printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_dofs * num_comp + j],
+                 assembled_true[i * num_dofs * num_comp + j]);
+          // LCOV_EXCL_STOP
+        }
+      }
+    }
+
+    // Cleanup
+    free(rows);
+    free(cols);
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_points);
+    CeedVectorDestroy(&q_data);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&v);
+    CeedVectorDestroy(&assembled);
+    CeedElemRestrictionDestroy(&elem_restriction_u);
+    CeedElemRestrictionDestroy(&elem_restriction_x);
+    CeedElemRestrictionDestroy(&elem_restriction_x_points);
+    CeedElemRestrictionDestroy(&elem_restriction_q_data);
+    CeedBasisDestroy(&basis_u);
+    CeedBasisDestroy(&basis_x);
+    CeedQFunctionDestroy(&qf_setup);
+    CeedQFunctionDestroy(&qf_mass);
+    CeedOperatorDestroy(&op_setup);
+    CeedOperatorDestroy(&op_mass);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t596-operator.h b/tests/t596-operator.h
new file mode 100644
index 0000000000..4fe3d700f5
--- /dev/null
+++ b/tests/t596-operator.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *weight = in[0], *J = in[1];
+  CeedScalar       *rho = out[0];
+
+  for (CeedInt i = 0; i < Q; i++) {
+    rho[i] = weight[i] * (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]);
+  }
+  return 0;
+}
+
+CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  CeedInt           num_comp = *(CeedInt *)ctx;
+  const CeedScalar *rho = in[0], *u = in[1];
+  CeedScalar       *v = out[0];
+
+  for (CeedInt i = 0; i < Q; i++) {
+    for (CeedInt c = 0; c < num_comp; c++) v[i + c * Q] = rho[i] * c * u[i + c * Q];
+  }
+  return 0;
+}
diff --git a/tests/t597-operator.c b/tests/t597-operator.c
new file mode 100644
index 0000000000..25d6b3cf3f
--- /dev/null
+++ b/tests/t597-operator.c
@@ -0,0 +1,203 @@
+/// @file
+/// Test full assembly of Poisson operator AtPoints
+/// \test Test full assembly of Poisson operator AtPoints
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "t597-operator.h"
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) {
+    CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data;
+    CeedBasis           basis_x, basis_u;
+    CeedQFunction       qf_setup, qf_diff;
+    CeedOperator        op_setup, op_diff;
+    CeedVector          q_data, x, x_points, u, v;
+    CeedInt             p = 3, q = 4, dim = 2;
+    CeedInt             n_x = 3, n_y = 2;
+    CeedInt             num_elem = n_x * n_y;
+    CeedInt             num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+    CeedInt             ind_x[num_elem * p * p];
+    CeedScalar          assembled_values[num_comp * num_comp * num_dofs * num_dofs];
+    CeedScalar          assembled_true[num_comp * num_comp * num_dofs * num_dofs];
+
+    // Points
+    CeedVectorCreate(ceed, dim * num_points, &x_points);
+    {
+      CeedScalar x_array[dim * num_points];
+
+      for (CeedInt e = 0; e < num_elem; e++) {
+        for (CeedInt d = 0; d < dim; d++) {
+          x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+        }
+      }
+      CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    {
+      CeedInt ind_x[num_elem + 1 + num_points];
+
+      for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+      for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                        &elem_restriction_x_points);
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim * (dim + 1) / 2, num_points * dim * (dim + 1) / 2, CEED_MEM_HOST,
+                                        CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+    }
+
+    // Vectors
+    CeedVectorCreate(ceed, dim * num_dofs, &x);
+    {
+      CeedScalar x_array[dim * num_dofs];
+
+      for (CeedInt i = 0; i < n_x * 2 + 1; i++) {
+        for (CeedInt j = 0; j < n_y * 2 + 1; j++) {
+          x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x);
+          x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y);
+        }
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedVectorCreate(ceed, num_comp * num_dofs, &u);
+    CeedVectorCreate(ceed, num_comp * num_dofs, &v);
+    CeedVectorCreate(ceed, num_points * dim * (dim + 1) / 2, &q_data);
+
+    // Restrictions
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt col, row, offset;
+      col    = i % n_x;
+      row    = i / n_x;
+      offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1);
+      for (CeedInt j = 0; j < p; j++) {
+        for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x,
+                              &elem_restriction_u);
+
+    // Bases
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x);
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u);
+
+    // QFunction - setup
+    CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+    CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_setup, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE);
+
+    // Operator - setup
+    CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+    CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+    CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+    // Apply Setup Operator
+    CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE);
+
+    // QFunction - apply
+    CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff);
+    CeedQFunctionAddInput(qf_diff, "du", num_comp * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_diff, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE);
+    CeedQFunctionAddOutput(qf_diff, "dv", num_comp * dim, CEED_EVAL_GRAD);
+    {
+      CeedQFunctionContext qf_context;
+
+      CeedQFunctionContextCreate(ceed, &qf_context);
+      CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp);
+      CeedQFunctionSetContext(qf_diff, qf_context);
+      CeedQFunctionContextDestroy(&qf_context);
+    }
+
+    // Operator - apply
+    CeedOperatorCreateAtPoints(ceed, qf_diff, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff);
+    CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+    CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points);
+
+    // Fully assemble operator
+    CeedSize   num_entries;
+    CeedInt   *rows;
+    CeedInt   *cols;
+    CeedVector assembled;
+
+    for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) {
+      assembled_values[k] = 0.0;
+      assembled_true[k]   = 0.0;
+    }
+    CeedOperatorLinearAssembleSymbolic(op_diff, &num_entries, &rows, &cols);
+    CeedVectorCreate(ceed, num_entries, &assembled);
+    CeedOperatorLinearAssemble(op_diff, assembled);
+    {
+      const CeedScalar *assembled_array;
+
+      CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
+      for (CeedInt k = 0; k < num_entries; k++) assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k];
+      CeedVectorRestoreArrayRead(assembled, &assembled_array);
+    }
+
+    // Manually assemble operator
+    CeedVectorSetValue(u, 0.0);
+    for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+      CeedScalar       *u_array;
+      const CeedScalar *v_array;
+
+      // Set input
+      CeedVectorGetArray(u, CEED_MEM_HOST, &u_array);
+      u_array[j] = 1.0;
+      if (j) u_array[j - 1] = 0.0;
+      CeedVectorRestoreArray(u, &u_array);
+
+      // Compute entries for column j
+      CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE);
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i];
+      CeedVectorRestoreArrayRead(v, &v_array);
+    }
+
+    // Check output
+    for (CeedInt i = 0; i < num_comp * num_dofs; i++) {
+      for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+        if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > 100. * CEED_EPSILON) {
+          // LCOV_EXCL_START
+          printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_comp * num_dofs + j],
+                 assembled_true[i * num_comp * num_dofs + j]);
+          // LCOV_EXCL_STOP
+        }
+      }
+    }
+
+    // Cleanup
+    free(rows);
+    free(cols);
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_points);
+    CeedVectorDestroy(&q_data);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&v);
+    CeedVectorDestroy(&assembled);
+    CeedElemRestrictionDestroy(&elem_restriction_u);
+    CeedElemRestrictionDestroy(&elem_restriction_x);
+    CeedElemRestrictionDestroy(&elem_restriction_x_points);
+    CeedElemRestrictionDestroy(&elem_restriction_q_data);
+    CeedBasisDestroy(&basis_u);
+    CeedBasisDestroy(&basis_x);
+    CeedQFunctionDestroy(&qf_setup);
+    CeedQFunctionDestroy(&qf_diff);
+    CeedOperatorDestroy(&op_setup);
+    CeedOperatorDestroy(&op_diff);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t597-operator.h b/tests/t597-operator.h
new file mode 100644
index 0000000000..b68854b6fe
--- /dev/null
+++ b/tests/t597-operator.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+
+  // in[0] is Jacobians with shape [2, nc=2, Q]
+  // in[1] is quadrature weights, size (Q)
+  const CeedScalar *J = in[0], *qw = in[1];
+
+  // out[0] is qdata, size (Q)
+  CeedScalar *qd = out[0];
+
+  // Quadrature point loop
+  for (CeedInt i = 0; i < Q; i++) {
+    // J: 0 2   qd: 0 2   adj(J):  J22 -J12
+    //    1 3       2 1           -J21  J11
+    const CeedScalar J11 = J[i + Q * 0];
+    const CeedScalar J21 = J[i + Q * 1];
+    const CeedScalar J12 = J[i + Q * 2];
+    const CeedScalar J22 = J[i + Q * 3];
+    const CeedScalar w   = qw[i] / (J11 * J22 - J21 * J12);
+    qd[i + Q * 0]        = w * (J12 * J12 + J22 * J22);
+    qd[i + Q * 2]        = w * (J11 * J11 + J21 * J21);
+    qd[i + Q * 1]        = -w * (J11 * J12 + J21 * J22);
+  }
+
+  return 0;
+}
+
+CEED_QFUNCTION(diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  CeedInt num_comp = *(CeedInt *)ctx;
+  // in[0] is gradient u, shape [2, nc=1, Q]
+  // in[1] is quadrature data, size (3*Q)
+  const CeedScalar *du = in[0], *qd = in[1];
+
+  // out[0] is output to multiply against gradient v, shape [2, nc=1, Q]
+  CeedScalar *dv = out[0];
+
+  // Quadrature point loop
+  for (CeedInt i = 0; i < Q; i++) {
+    for (CeedInt c = 0; c < num_comp; c++) {
+      const CeedScalar du0 = du[i + Q * (2 * c + 0)];
+      const CeedScalar du1 = du[i + Q * (2 * c + 1)];
+
+      dv[i + Q * (2 * c + 0)] = qd[i + Q * 0] * du0 + qd[i + Q * 2] * du1;
+      dv[i + Q * (2 * c + 1)] = qd[i + Q * 2] * du0 + qd[i + Q * 1] * du1;
+    }
+  }
+
+  return 0;
+}

From 8b7d3340714d052d4785a048802399d76a15146b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Apr 2025 11:19:20 -0600
Subject: [PATCH 384/571] gen - unload module when destroying op

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 1 +
 backends/hip-gen/ceed-hip-gen-operator.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index c3b6ec0644..f1295252a6 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module));
   if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index d083ca84c3..fc90b93228 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -25,6 +25,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
   if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
   CeedCallBackend(CeedDestroy(&ceed));

From c2620745127085f3a3887653cdbe37743cb5942b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Apr 2025 11:45:44 -0600
Subject: [PATCH 385/571] cuda - remove unused struct members

---
 backends/cuda-shared/ceed-cuda-shared.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index ab66e38926..6ef6770758 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -32,8 +32,6 @@ typedef struct {
   CeedScalar *d_collo_grad_1d;
   CeedScalar *d_q_weight_1d;
   CeedScalar *d_chebyshev_interp_1d;
-  CeedScalar *c_B;
-  CeedScalar *c_G;
   CeedInt     num_elem_at_points;
   CeedInt    *h_points_per_elem;
   CeedInt    *d_points_per_elem;

From 0b37c0667fc56f667e0e505cc49b864fa3916f61 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 25 Apr 2025 11:46:32 -0600
Subject: [PATCH 386/571] Reset work vectors in operator setup to save memory

---
 backends/cuda-ref/ceed-cuda-ref-operator.c |  5 ++
 backends/hip-ref/ceed-hip-ref-operator.c   |  2 +
 include/ceed/backend.h                     |  1 +
 interface/ceed.c                           | 58 ++++++++++++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 025d5b6501..8b702c3802 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -353,6 +353,8 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
@@ -740,6 +742,9 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index b9a7231247..b719bea991 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -352,6 +352,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
@@ -738,6 +739,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 7cdbe63ef9..a67f585859 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -256,6 +256,7 @@ CEED_EXTERN int CeedSetData(Ceed ceed, void *data);
 CEED_EXTERN int CeedReference(Ceed ceed);
 CEED_EXTERN int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec);
 CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec);
+CEED_EXTERN int CeedClearWorkVectors(Ceed ceed, CeedSize min_len);
 CEED_EXTERN int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots);
 CEED_EXTERN int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots);
 CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines);
diff --git a/interface/ceed.c b/interface/ceed.c
index 114aae9a20..d745c14d8a 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -817,6 +817,63 @@ int CeedReference(Ceed ceed) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Computes the current memory usage of the work vectors in a `Ceed` context and prints to debug.abort
+
+  @param[in] ceed `Ceed` context
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedViewWorkVectorMemoryUsage(Ceed ceed) {
+  CeedScalar work_len = 0.;
+
+  if (ceed->work_vectors) {
+    for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
+      CeedSize vec_len;
+      CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len));
+      work_len += vec_len;
+    }
+    work_len *= sizeof(CeedScalar) * 1e-6;
+    CeedDebug(ceed, "Resource {%s} Work Vectors Memory Usage: %" CeedInt_FMT " vectors, %g MB\n", ceed->resource, ceed->work_vectors->num_vecs,
+              work_len);
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Clear inactive work vectors in a `Ceed` context below a minimum length.
+
+  @param[in,out] ceed    `Ceed` context
+  @param[in]     min_len Minimum length of work vector to keep
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) {
+  if (!ceed->work_vectors) return CEED_ERROR_SUCCESS;
+  for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    if (ceed->work_vectors->is_in_use[i]) continue;
+    CeedSize vec_len;
+    CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len));
+    if (vec_len < min_len) {
+      ceed->ref_count += 2;  // Note: increase ref_count to prevent Ceed destructor from triggering again
+      CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i]));
+      ceed->ref_count -= 1;  // Note: restore ref_count
+      ceed->work_vectors->num_vecs--;
+      if (ceed->work_vectors->num_vecs > 0) {
+        ceed->work_vectors->vecs[i]                                 = ceed->work_vectors->vecs[ceed->work_vectors->num_vecs];
+        ceed->work_vectors->is_in_use[i]                            = ceed->work_vectors->is_in_use[ceed->work_vectors->num_vecs];
+        ceed->work_vectors->is_in_use[ceed->work_vectors->num_vecs] = false;
+        i--;
+      }
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get a `CeedVector` for scratch work from a `Ceed` context.
 
@@ -858,6 +915,7 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
     ceed->work_vectors->num_vecs++;
     CeedCallBackend(CeedVectorCreate(ceed, len, &ceed->work_vectors->vecs[i]));
     ceed->ref_count--;  // Note: ref_count manipulation to prevent a ref-loop
+    if (ceed->is_debug) CeedViewWorkVectorMemoryUsage(ceed);
   }
   // Return pointer to work vector
   ceed->work_vectors->is_in_use[i] = true;

From 55326fe74c96ccbef44e39fe76fb9d1eaadabe42 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 25 Apr 2025 13:46:54 -0600
Subject: [PATCH 387/571] Change `CeedViewWorkVectorMemoryUsage` to
 `CeedGetWorkVectorMemoryUsage`

---
 include/ceed/backend.h |  1 +
 interface/ceed.c       | 18 +++++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index a67f585859..a3707eba4a 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -257,6 +257,7 @@ CEED_EXTERN int CeedReference(Ceed ceed);
 CEED_EXTERN int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec);
 CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec);
 CEED_EXTERN int CeedClearWorkVectors(Ceed ceed, CeedSize min_len);
+CEED_EXTERN int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb);
 CEED_EXTERN int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots);
 CEED_EXTERN int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots);
 CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines);
diff --git a/interface/ceed.c b/interface/ceed.c
index d745c14d8a..56291fe882 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -826,18 +826,17 @@ int CeedReference(Ceed ceed) {
 
   @ref Developer
 **/
-static int CeedViewWorkVectorMemoryUsage(Ceed ceed) {
-  CeedScalar work_len = 0.;
-
+int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) {
+  *usage_mb = 0.0;
   if (ceed->work_vectors) {
     for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
       CeedSize vec_len;
       CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len));
-      work_len += vec_len;
+      *usage_mb += vec_len;
     }
-    work_len *= sizeof(CeedScalar) * 1e-6;
-    CeedDebug(ceed, "Resource {%s} Work Vectors Memory Usage: %" CeedInt_FMT " vectors, %g MB\n", ceed->resource, ceed->work_vectors->num_vecs,
-              work_len);
+    *usage_mb *= sizeof(CeedScalar) * 1e-6;
+    CeedDebug(ceed, "Resource {%s}: Work vectors memory usage: %" CeedInt_FMT " vectors, %g MB\n", ceed->resource, ceed->work_vectors->num_vecs,
+              *usage_mb);
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -888,7 +887,8 @@ int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) {
   @ref Backend
 **/
 int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
-  CeedInt i = 0;
+  CeedInt    i = 0;
+  CeedScalar usage_mb;
 
   if (!ceed->work_vectors) CeedCall(CeedWorkVectorsCreate(ceed));
 
@@ -915,7 +915,7 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
     ceed->work_vectors->num_vecs++;
     CeedCallBackend(CeedVectorCreate(ceed, len, &ceed->work_vectors->vecs[i]));
     ceed->ref_count--;  // Note: ref_count manipulation to prevent a ref-loop
-    if (ceed->is_debug) CeedViewWorkVectorMemoryUsage(ceed);
+    if (ceed->is_debug) CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
   }
   // Return pointer to work vector
   ceed->work_vectors->is_in_use[i] = true;

From 021a32ddaeb3f5ae90defc18a1194608e9df3b1b Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 25 Apr 2025 13:47:22 -0600
Subject: [PATCH 388/571] Add tests for work vector memory usage

---
 tests/t131-vector.c | 58 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 tests/t131-vector.c

diff --git a/tests/t131-vector.c b/tests/t131-vector.c
new file mode 100644
index 0000000000..3fe78c6b94
--- /dev/null
+++ b/tests/t131-vector.c
@@ -0,0 +1,58 @@
+/// @file
+/// Test clearing work vectors
+/// \test Test clearing work vectors
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar expected_usage(CeedSize length) { return length * sizeof(CeedScalar) * 1e-6; }
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedVector x, y, z;
+  CeedScalar usage_mb;
+
+  CeedInit(argv[1], &ceed);
+
+  // Add work vectors of different lengths
+  CeedGetWorkVector(ceed, 10, &x);
+  CeedGetWorkVector(ceed, 20, &y);
+  CeedGetWorkVector(ceed, 30, &z);
+
+  // Check memory usage, should be 60 * sizeof(CeedScalar)
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(60)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(60));
+
+  // Restore x and z
+  CeedRestoreWorkVector(ceed, &x);
+  CeedRestoreWorkVector(ceed, &z);
+
+  // Clear work vectors with length < 30. This should:
+  //  - Remove x
+  //  - Leave y, since it is still in use
+  //  - Leave z, since it is length 30
+  CeedClearWorkVectors(ceed, 30);
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(50)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(50));
+
+  // Clear work vectors with length < 31. This should:
+  //  - Leave y, since it is still in use
+  //  - Remove z
+  CeedClearWorkVectors(ceed, 31);
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(20)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(20));
+
+  // Restore y
+  CeedRestoreWorkVector(ceed, &y);
+
+  // Make sure we can still get back y without allocating a new work vector
+  CeedGetWorkVector(ceed, 20, &y);
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(20)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(20));
+  CeedRestoreWorkVector(ceed, &y);
+
+  CeedDestroy(&ceed);
+  return 0;
+}

From 56a26733dd7a83996d18a1b6748faedf092f167d Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 25 Apr 2025 15:46:24 -0700
Subject: [PATCH 389/571] Minor wording clarification

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>
---
 interface/ceed.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed.c b/interface/ceed.c
index 56291fe882..203d2e5790 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -858,7 +858,7 @@ int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) {
     CeedSize vec_len;
     CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len));
     if (vec_len < min_len) {
-      ceed->ref_count += 2;  // Note: increase ref_count to prevent Ceed destructor from triggering again
+      ceed->ref_count += 2;  // Note: increase ref_count to prevent Ceed destructor from triggering
       CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i]));
       ceed->ref_count -= 1;  // Note: restore ref_count
       ceed->work_vectors->num_vecs--;

From 25433d25587bd1aa42a64ac5062e4dc758580d26 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 25 Apr 2025 16:55:00 -0600
Subject: [PATCH 390/571] Pre-request work vectors in operator setup

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 19 +++++++++++++++++--
 backends/hip-ref/ceed-hip-ref-operator.c   | 18 ++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 8b702c3802..de6129fc25 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -353,8 +353,16 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
-
   CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
+
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
@@ -742,9 +750,16 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
-
   CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
 
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index b719bea991..15fc21b73a 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -353,6 +353,15 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
     }
   }
   CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
+
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
@@ -740,6 +749,15 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
     }
   }
   CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
+
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;

From 99641342ef271fbaf0db3da7242ca437a6202b8c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 28 Apr 2025 12:10:01 -0600
Subject: [PATCH 391/571] pc - multigrid level setup for AtPoints

---
 interface/ceed-preconditioning.c |  18 +-
 tests/t598-operator.c            | 279 +++++++++++++++++++++++++++++++
 2 files changed, 296 insertions(+), 1 deletion(-)
 create mode 100644 tests/t598-operator.c

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 5de71fed38..9fa12e28e2 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -885,7 +885,23 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
   CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Automatic multigrid setup for composite operators not supported");
 
   // Coarse Grid
-  CeedCall(CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse));
+  {
+    bool is_at_points;
+
+    CeedCall(CeedOperatorIsAtPoints(op_fine, &is_at_points));
+    if (is_at_points) {
+      CeedVector          point_coords;
+      CeedElemRestriction rstr_points;
+
+      CeedCall(CeedOperatorCreateAtPoints(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse));
+      CeedCall(CeedOperatorAtPointsGetPoints(op_fine, &rstr_points, &point_coords));
+      CeedCall(CeedOperatorAtPointsSetPoints(*op_coarse, rstr_points, point_coords));
+      CeedCall(CeedVectorDestroy(&point_coords));
+      CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+    } else {
+      CeedCall(CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse));
+    }
+  }
   CeedCall(CeedOperatorGetFields(op_fine, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
   // -- Clone input fields
   for (CeedInt i = 0; i < num_input_fields; i++) {
diff --git a/tests/t598-operator.c b/tests/t598-operator.c
new file mode 100644
index 0000000000..55c7560fbb
--- /dev/null
+++ b/tests/t598-operator.c
@@ -0,0 +1,279 @@
+/// @file
+/// Test creation, action, and destruction for mass matrix operator AtPoints
+/// \test Test creation, action, and destruction for mass matrix operator AtPoints
+#include "t591-operator.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char **argv) {
+  Ceed                ceed;
+  CeedInt             num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p_coarse = 2, p_fine = 3, q = 5;
+  CeedInt             num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedInt             num_nodes_coarse = (num_elem_1d * (p_coarse - 1) + 1) * (num_elem_1d * (p_coarse - 1) + 1);
+  CeedInt             num_nodes_fine   = (num_elem_1d * (p_fine - 1) + 1) * (num_elem_1d * (p_fine - 1) + 1);
+  CeedVector          x_points, x_elem, q_data, u_coarse, u_fine, v_coarse, v_fine, p_mult_fine;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u_coarse, elem_restriction_u_fine;
+  CeedBasis           basis_x, basis_u_coarse, basis_u_fine;
+  CeedQFunction       qf_setup, qf_mass;
+  CeedOperator        op_setup, op_mass_coarse, op_mass_fine, op_prolong, op_restrict;
+
+  CeedInit(argv[1], &ceed);
+
+  // Point reference coordinates
+  CeedVectorCreate(ceed, dim * num_points, &x_points);
+  {
+    CeedScalar x_array[dim * num_points];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      for (CeedInt d = 0; d < dim; d++) {
+        x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+      }
+    }
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  {
+    CeedInt ind_x[num_elem + 1 + num_points];
+
+    for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+    for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+  }
+
+  // Q data
+  CeedVectorCreate(ceed, num_points, &q_data);
+
+  // Cell coordinates
+  {
+    CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1);
+    CeedInt ind_x[num_elem * p * p];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p;
+        }
+        elem_nodes[n] = p * g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x);
+    CeedVectorCreate(ceed, dim * num_nodes, &x_elem);
+    {
+      CeedScalar x_array[dim * num_nodes];
+
+      for (CeedInt i = 0; i <= num_elem_1d; i++) {
+        for (CeedInt j = 0; j <= num_elem_1d; j++) {
+          x_array[(i * (num_elem_1d + 1) + j) * dim + 0] = j;
+          x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i;
+        }
+      }
+      CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+  }
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x);
+
+  // Cell solution
+  {
+    CeedInt ind_u[num_elem * p_coarse * p_coarse];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_coarse - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p_coarse * p_coarse, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p_coarse - 1) + r_node % p_coarse) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p_coarse;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p_coarse * p_coarse, 1, 1, num_nodes_coarse, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u,
+                              &elem_restriction_u_coarse);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_coarse, q, CEED_GAUSS, &basis_u_coarse);
+  {
+    CeedInt ind_u[num_elem * p_fine * p_fine];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_fine - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p_fine * p_fine, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p_fine - 1) + r_node % p_fine) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p_fine;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p_fine * p_fine, 1, 1, num_nodes_fine, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u,
+                              &elem_restriction_u_fine);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_fine, q, CEED_GAUSS, &basis_u_fine);
+
+  // Setup geometric scaling
+  CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
+
+  CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+  CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Mass operator
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+  CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_fine);
+  CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points);
+
+  CeedVectorCreate(ceed, num_nodes_fine, &u_fine);
+  CeedVectorCreate(ceed, num_nodes_fine, &v_fine);
+  CeedVectorCreate(ceed, num_nodes_fine, &p_mult_fine);
+  CeedVectorCreate(ceed, num_nodes_coarse, &u_coarse);
+  CeedVectorCreate(ceed, num_nodes_coarse, &v_coarse);
+
+  // Create multigrid level
+  CeedVectorSetValue(p_mult_fine, 1.0);
+  CeedOperatorMultigridLevelCreate(op_mass_fine, p_mult_fine, elem_restriction_u_coarse, basis_u_coarse, &op_mass_coarse, &op_prolong, &op_restrict);
+
+  // Coarse problem
+  CeedVectorSetValue(u_coarse, 1.0);
+  CeedOperatorApply(op_mass_coarse, u_coarse, v_coarse, CEED_REQUEST_IMMEDIATE);
+
+  // Check output
+  {
+    const CeedScalar *v_array;
+    CeedScalar        sum = 0.;
+
+    CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes_coarse; i++) {
+      sum += v_array[i];
+    }
+    CeedVectorRestoreArrayRead(v_coarse, &v_array);
+    if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum);
+  }
+
+  // Prolong coarse u
+  CeedOperatorApply(op_prolong, u_coarse, u_fine, CEED_REQUEST_IMMEDIATE);
+
+  // Fine problem
+  CeedOperatorApply(op_mass_fine, u_fine, v_fine, CEED_REQUEST_IMMEDIATE);
+
+  // Check output
+  {
+    const CeedScalar *v_array;
+    CeedScalar        sum = 0.;
+
+    CeedVectorGetArrayRead(v_fine, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes_fine; i++) {
+      sum += v_array[i];
+    }
+    CeedVectorRestoreArrayRead(v_fine, &v_array);
+
+    if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum);
+  }
+  // Restrict state to coarse grid
+  CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE);
+
+  // Check output
+  {
+    const CeedScalar *v_array;
+    CeedScalar        sum = 0.;
+
+    CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes_coarse; i++) {
+      sum += v_array[i];
+    }
+    CeedVectorRestoreArrayRead(v_coarse, &v_array);
+    if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum);
+  }
+
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&x_elem);
+  CeedVectorDestroy(&u_coarse);
+  CeedVectorDestroy(&u_fine);
+  CeedVectorDestroy(&v_fine);
+  CeedVectorDestroy(&v_coarse);
+  CeedVectorDestroy(&p_mult_fine);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_x);
+  CeedElemRestrictionDestroy(&elem_restriction_u_coarse);
+  CeedElemRestrictionDestroy(&elem_restriction_u_fine);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u_coarse);
+  CeedBasisDestroy(&basis_u_fine);
+  CeedQFunctionDestroy(&qf_setup);
+  CeedQFunctionDestroy(&qf_mass);
+  CeedOperatorDestroy(&op_setup);
+  CeedOperatorDestroy(&op_mass_coarse);
+  CeedOperatorDestroy(&op_mass_fine);
+  CeedOperatorDestroy(&op_prolong);
+  CeedOperatorDestroy(&op_restrict);
+  CeedDestroy(&ceed);
+  return 0;
+}

From c5a41aec3f06ad4e8ccb2139dcac9574697db9f7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 30 Apr 2025 10:49:56 -0600
Subject: [PATCH 392/571] ci - use newer images for docs build (#1816)

* ci - use newer images for docs build

* doc - explicitly install playwright first
---
 .readthedocs.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index b530328523..2b173bda47 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,14 +5,15 @@
 version: 2
 
 build:
-  os: ubuntu-22.04
+  os: ubuntu-24.04
   tools:
-    python: "3.11"
-    nodejs: "19"
+    python: "3.13"
+    nodejs: "23"
   apt_packages:
     - librsvg2-bin
   jobs:
     post_create_environment:
+      - npx playwright install
       - npm install -g @mermaid-js/mermaid-cli
 
 # Build documentation in the docs/ directory with Sphinx

From 0002d81d205a4e0fbfcfe4897732c8c00278f8e6 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Sun, 4 May 2025 15:59:45 -0700
Subject: [PATCH 393/571] Use stream sync instead of device sync for hipBlas
 calls

---
 backends/hip-ref/ceed-hip-ref-vector.c | 40 ++++++++++++++++++++------
 backends/hip-ref/ceed-hip-ref.c        |  5 +++-
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 2c1748033e..639a11bb28 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -309,15 +309,18 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
 #if (HIP_VERSION >= 60000000)
     hipblasHandle_t handle;
+    hipStream_t     stream;
     Ceed            ceed;
 
     CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
     CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
+    CeedCallHipblas(ceed, hipblasGetStream(handle, &stream));
 #if defined(CEED_SCALAR_IS_FP32)
     CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
 #else  /* CEED_SCALAR */
     CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
 #endif /* CEED_SCALAR */
+    CeedCallHip(ceed, hipStreamSynchronize(stream));
 #else  /* HIP_VERSION */
     CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, stop, step, copy_array));
 #endif /* HIP_VERSION */
@@ -557,6 +560,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
   const CeedScalar *d_array;
   CeedVector_Hip   *impl;
   hipblasHandle_t   handle;
+  hipStream_t       stream;
   Ceed_Hip         *hip_data;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
@@ -564,7 +568,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
-
+  CeedCallHipblas(ceed, hipblasGetStream(handle, &stream));
 #if (HIP_VERSION < 60000000)
   // With ROCm 6, we can use the 64-bit integer interface. Prior to that,
   // we need to check if the vector is too long to handle with int32,
@@ -581,6 +585,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 #if defined(CEED_SCALAR_IS_FP32)
 #if (HIP_VERSION >= 60000000)  // We have ROCm 6, and can use 64-bit integers
       CeedCallHipblas(ceed, hipblasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
 #else  /* HIP_VERSION */
       float  sub_norm = 0.0;
       float *d_array_start;
@@ -591,12 +596,14 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
         CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
         CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
         *norm += sub_norm;
       }
 #endif /* HIP_VERSION */
 #else  /* CEED_SCALAR */
 #if (HIP_VERSION >= 60000000)
       CeedCallHipblas(ceed, hipblasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
 #else  /* HIP_VERSION */
       double  sub_norm = 0.0;
       double *d_array_start;
@@ -607,6 +614,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
         CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
         CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
         *norm += sub_norm;
       }
 #endif /* HIP_VERSION */
@@ -617,6 +625,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 #if defined(CEED_SCALAR_IS_FP32)
 #if (HIP_VERSION >= 60000000)
       CeedCallHipblas(ceed, hipblasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
 #else  /* HIP_VERSION */
       float  sub_norm = 0.0, norm_sum = 0.0;
       float *d_array_start;
@@ -627,6 +636,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
         CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
         CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
         norm_sum += sub_norm * sub_norm;
       }
       *norm = sqrt(norm_sum);
@@ -634,6 +644,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 #else  /* CEED_SCALAR */
 #if (HIP_VERSION >= 60000000)
       CeedCallHipblas(ceed, hipblasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
 #else  /* HIP_VERSION */
       double  sub_norm = 0.0, norm_sum = 0.0;
       double *d_array_start;
@@ -644,6 +655,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
         CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
 
         CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
         norm_sum += sub_norm * sub_norm;
       }
       *norm = sqrt(norm_sum);
@@ -658,7 +670,8 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
       CeedScalar norm_no_abs;
 
       CeedCallHipblas(ceed, hipblasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
-      CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+      CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
       *norm = fabs(norm_no_abs);
 #else  /* HIP_VERSION */
       CeedInt index;
@@ -672,10 +685,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 
         CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
         if (hip_data->has_unified_addressing) {
-          CeedCallHip(ceed, hipDeviceSynchronize());
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
           sub_max = fabs(d_array[index - 1]);
         } else {
-          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+          CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
         }
         if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
@@ -688,10 +702,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 
       CeedCallHipblas(ceed, hipblasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
       if (hip_data->has_unified_addressing) {
-        CeedCallHip(ceed, hipDeviceSynchronize());
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
         norm_no_abs = fabs(d_array[index - 1]);
       } else {
-        CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+        CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
       }
       *norm = fabs(norm_no_abs);
 #else  /* HIP_VERSION */
@@ -706,10 +721,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor
 
         CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
         if (hip_data->has_unified_addressing) {
-          CeedCallHip(ceed, hipDeviceSynchronize());
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
           sub_max = fabs(d_array[index - 1]);
         } else {
-          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
+          CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
         }
         if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
@@ -780,13 +796,16 @@ static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) {
   if (impl->d_array) {
 #if (HIP_VERSION >= 60000000)
     hipblasHandle_t handle;
+    hipStream_t     stream;
 
     CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle));
+    CeedCallHipblas(CeedVectorReturnCeed(x), hipblasGetStream(handle, &stream));
 #if defined(CEED_SCALAR_IS_FP32)
     CeedCallHipblas(CeedVectorReturnCeed(x), hipblasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
 #else  /* CEED_SCALAR */
     CeedCallHipblas(CeedVectorReturnCeed(x), hipblasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
 #endif /* CEED_SCALAR */
+    CeedCallHip(CeedVectorReturnCeed(x), hipStreamSynchronize(stream));
 #else  /* HIP_VERSION */
     CeedCallBackend(CeedDeviceScale_Hip(impl->d_array, alpha, length));
 #endif /* HIP_VERSION */
@@ -827,13 +846,16 @@ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
 #if (HIP_VERSION >= 60000000)
     hipblasHandle_t handle;
+    hipStream_t     stream;
 
-    CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(y), &handle));
+    CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle));
+    CeedCallHipblas(CeedVectorReturnCeed(y), hipblasGetStream(handle, &stream));
 #if defined(CEED_SCALAR_IS_FP32)
     CeedCallHipblas(CeedVectorReturnCeed(y), hipblasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
 #else  /* CEED_SCALAR */
     CeedCallHipblas(CeedVectorReturnCeed(y), hipblasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
 #endif /* CEED_SCALAR */
+    CeedCallHip(CeedVectorReturnCeed(y), hipStreamSynchronize(stream));
 #else  /* HIP_VERSION */
     CeedCallBackend(CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length));
 #endif /* HIP_VERSION */
diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c
index b15686b8dc..2587e7fba3 100644
--- a/backends/hip-ref/ceed-hip-ref.c
+++ b/backends/hip-ref/ceed-hip-ref.c
@@ -29,7 +29,10 @@ int CeedGetHipblasHandle_Hip(Ceed ceed, hipblasHandle_t *handle) {
   Ceed_Hip *data;
 
   CeedCallBackend(CeedGetData(ceed, &data));
-  if (!data->hipblas_handle) CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle));
+  if (!data->hipblas_handle) {
+    CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle));
+    CeedCallHipblas(ceed, hipblasSetPointerMode(data->hipblas_handle, HIPBLAS_POINTER_MODE_HOST));
+  }
   *handle = data->hipblas_handle;
   return CEED_ERROR_SUCCESS;
 }

From 7b3ff0698626cc2e5ce463afc10290072fd55c90 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 5 May 2025 11:01:18 -0600
Subject: [PATCH 394/571] New Python Examples (#1817)

* ex1-volume python example (#1804)

* Adds initial ex1 python example

* Update examples/python/ex1-volume.py

* after make format

* formatting change

* modifies main

* adds strided restriction

* Update examples/python/ex1-volume.py

---------

ex - add python version of ex1 volume

* ex2_Surface implementation in python (#1802)

* Add files via upload

* Update ex2_surface.py

* Update ex2_surface.py

* Update ex2_surface.py

* Update ex2_surface.py

* Update ex2_surface.py

* Update ex2_surface.py

* Update ex2_surface.py

* Delete examples/ceed/ex2_surface.py

* Add cleaned and formatted ex2_surface.py example

* style: update header and reformat ex2_surface.py

* Delete examples/ceed/ex2_surface.py

* Python Surface area example

----------------------------------------

ex - add ex2 surface example in python

* ex - consolidate common py code

* ex - add py gallery option

* ex - add py test harness

* ex3 volume python example

* ex - switch ex3 to use common python code

* ci - add new python examples to testing

---------

Co-authored-by: katayoonk <122122167+katayoonk@users.noreply.github.com>
Co-authored-by: Surinder singh chhabra <93762514+arrowguy234@users.noreply.github.com>
Co-authored-by: Arshia Ilaty <arshia.ilaty99@gmail.com>
---
 .github/workflows/python-test-with-style.yml |   2 +
 examples/ceed/ex3-volume.c                   |   2 +-
 examples/python/Makefile                     |  20 ++
 examples/python/conftest.py                  |  25 ++
 examples/python/ex1_volume.py                | 181 +++++++++++++
 examples/python/ex2_surface.py               | 186 +++++++++++++
 examples/python/ex3_volume.py                | 178 ++++++++++++
 examples/python/ex_common.py                 | 255 ++++++++++++++++++
 examples/python/ex_test.py                   | 269 +++++++++++++++++++
 examples/python/qfunctions/ex-common.h       |  14 +
 examples/python/qfunctions/ex1-volume.h      |  60 +++++
 examples/python/qfunctions/ex2-surface.h     | 135 ++++++++++
 examples/python/qfunctions/ex3-volume.h      | 168 ++++++++++++
 examples/python/qfunctions/qfunctions.c      |  22 ++
 examples/python/setup_qfunctions.py          |  32 +++
 python/tests/Makefile                        |   2 +
 python/tests/test-0-ceed.py                  |   2 +-
 17 files changed, 1551 insertions(+), 2 deletions(-)
 create mode 100644 examples/python/Makefile
 create mode 100644 examples/python/conftest.py
 create mode 100644 examples/python/ex1_volume.py
 create mode 100644 examples/python/ex2_surface.py
 create mode 100644 examples/python/ex3_volume.py
 create mode 100644 examples/python/ex_common.py
 create mode 100644 examples/python/ex_test.py
 create mode 100644 examples/python/qfunctions/ex-common.h
 create mode 100644 examples/python/qfunctions/ex1-volume.h
 create mode 100644 examples/python/qfunctions/ex2-surface.h
 create mode 100644 examples/python/qfunctions/ex3-volume.h
 create mode 100644 examples/python/qfunctions/qfunctions.c
 create mode 100644 examples/python/setup_qfunctions.py

diff --git a/.github/workflows/python-test-with-style.yml b/.github/workflows/python-test-with-style.yml
index ef87f09090..4c2764b244 100644
--- a/.github/workflows/python-test-with-style.yml
+++ b/.github/workflows/python-test-with-style.yml
@@ -38,6 +38,8 @@ jobs:
         pip install .
         cd python/tests
         PYTHON=python3 make test TEST_OPTS="--ceed /cpu/self/ref/serial -vv"
+        cd ../../examples/python
+        PYTHON=python3 make test TEST_OPTS="--ceed /cpu/self/ref/serial -vv"
         cd ../..
     - name: Python style
       env:
diff --git a/examples/ceed/ex3-volume.c b/examples/ceed/ex3-volume.c
index 00293f3b46..13f5275e07 100644
--- a/examples/ceed/ex3-volume.c
+++ b/examples/ceed/ex3-volume.c
@@ -204,7 +204,7 @@ int main(int argc, const char *argv[]) {
   CeedQFunctionAddOutput(qf_apply, "dv", dim, CEED_EVAL_GRAD);
   CeedQFunctionSetContext(qf_apply, build_ctx);
 
-  // Create the mass +diffusion operator.
+  // Create the mass + diffusion operator.
   CeedOperator op_apply;
 
   CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
diff --git a/examples/python/Makefile b/examples/python/Makefile
new file mode 100644
index 0000000000..2e22bc0f2d
--- /dev/null
+++ b/examples/python/Makefile
@@ -0,0 +1,20 @@
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+PYTHON ?= python3
+
+clean:
+	rm -rf build __pycache__ .pytest_cache *.so
+
+setup:
+	$(PYTHON) setup_qfunctions.py build
+
+TEST_OPTS ?= --ceed /cpu/self/ref/serial
+test: setup
+	$(PYTHON) -m pytest ex_test.py $(TEST_OPTS)
+
+.PHONY: clean setup test
diff --git a/examples/python/conftest.py b/examples/python/conftest.py
new file mode 100644
index 0000000000..6c763ac90a
--- /dev/null
+++ b/examples/python/conftest.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+import pytest
+
+# -------------------------------------------------------------------------------
+# Add --ceed command line argument
+# -------------------------------------------------------------------------------
+
+
+def pytest_addoption(parser):
+    parser.addoption("--ceed", action="store", default='/cpu/self/ref/blocked')
+
+
+@pytest.fixture(scope='session')
+def ceed_resource(request):
+    ceed_resource = request.config.option.ceed
+
+    return ceed_resource
+
+# -------------------------------------------------------------------------------
diff --git a/examples/python/ex1_volume.py b/examples/python/ex1_volume.py
new file mode 100644
index 0000000000..91b6b8d41d
--- /dev/null
+++ b/examples/python/ex1_volume.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+#
+# libCEED example using diffusion operator to compute surface area
+#
+# Sample runs:
+#
+#     python ex1_volume.py
+#     python ex1_volume -c /cpu/self
+#     python ex1_volume -c /gpu/cuda
+
+import sys
+import os
+import numpy as np
+import libceed
+import ex_common as common
+
+
+def main():
+    """Main function for volume example"""
+    args = common.parse_arguments()
+    return example_1(args)
+
+
+def example_1(args):
+    """Compute volume using mass operator
+
+    Args:
+        args: Parsed command line arguments
+
+    Returns:
+        int: 0 on success, error code on failure
+    """
+    # Process arguments
+    dim = args.dim
+    mesh_degree = max(args.mesh_degree, args.solution_degree)
+    sol_degree = args.solution_degree
+    num_qpts = args.quadrature_points
+    problem_size = args.problem_size if args.problem_size > 0 else (8 * 16 if args.test else 256 * 1024)
+    ncomp_x = dim  # Number of coordinate components
+
+    # Print configuration
+    if not args.quiet:
+        print("Selected options: [command line option] : <current value>")
+        print(f"    Ceed specification [-c] : {args.ceed}")
+        print(f"    Mesh dimension     [-d] : {dim}")
+        print(f"    Mesh degree        [-m] : {mesh_degree}")
+        print(f"    Solution degree    [-p] : {sol_degree}")
+        print(f"    Num. 1D quadr. pts [-q] : {num_qpts}")
+        print(f"    Approx. # unknowns [-s] : {problem_size}")
+        print(f"    QFunction source   [-g] : {'gallery' if args.gallery else 'user'}")
+
+    # Initialize CEED
+    ceed = libceed.Ceed(args.ceed)
+
+    # Create bases
+    # Tensor-product Lagrange basis for mesh coordinates
+    mesh_basis = ceed.BasisTensorH1Lagrange(
+        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Tensor-product Lagrange basis for solution
+    solution_basis = ceed.BasisTensorH1Lagrange(
+        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Create mesh
+    # Determine mesh size
+    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
+    if not args.quiet:
+        print("\nMesh size                   : nx = %d" % num_xyz[0], end="")
+        if dim > 1:
+            print(", ny = %d" % num_xyz[1], end="")
+        if dim > 2:
+            print(", nz = %d" % num_xyz[2], end="")
+        print()
+
+    # Create element restrictions
+    num_q_comp = 1
+    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
+    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)
+
+    if not args.quiet:
+        print("Number of mesh nodes        : %d" % (mesh_size // dim))
+        print("Number of solution nodes    : %d" % sol_size)
+
+    # Create and transform mesh coordinates
+    mesh_coords = ceed.Vector(mesh_size)
+    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
+    exact_volume, _ = common.transform_mesh_coords(dim, mesh_size, mesh_coords)
+
+    # Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data
+    qf_build = None
+    if args.gallery:
+        qf_build = ceed.QFunctionByName(f"Mass{dim}DBuild")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_build = ceed.QFunction(1, qfs_so.build_mass,
+                                  os.path.join(file_dir, "ex1-volume.h:build_mass"))
+        qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
+        qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
+        qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_build.set_context(build_ctx)
+
+    # Create the operator that builds the quadrature data for the mass operator
+    op_build = ceed.Operator(qf_build)
+    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
+    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
+    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)
+
+    # Compute the quadrature data for the mass operator
+    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
+    op_build.apply(mesh_coords, q_data)
+
+    # Setup QFunction for applying the mass operator
+    qf_mass = None
+    if args.gallery:
+        qf_mass = ceed.QFunctionByName("MassApply")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_mass = ceed.QFunction(1, qfs_so.apply_mass,
+                                 os.path.join(file_dir, "ex1-volume.h:apply_mass"))
+        qf_mass.add_input("u", 1, libceed.EVAL_INTERP)
+        qf_mass.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_mass.add_output("v", 1, libceed.EVAL_INTERP)
+        qf_mass.set_context(build_ctx)
+
+    # Create the mass operator
+    op_mass = ceed.Operator(qf_mass)
+    op_mass.set_field("u", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_mass.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
+    op_mass.set_field("v", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+
+    # Create solution vectors
+    u = ceed.Vector(sol_size)
+    v = ceed.Vector(sol_size)
+    u.set_value(1.0)  # Set all entries of u to 1.0
+
+    # Apply mass operator: v = M * u
+    op_mass.apply(u, v)
+
+    # Compute volume by summing all entries in v
+    volume = 0.0
+    with v.array_read() as v_array:
+        # Simply sum all values to compute the volume
+        volume = np.sum(v_array)
+
+    if not args.test:
+        print()
+        print(f"Exact mesh volume    : {exact_volume:.14g}")
+        print(f"Computed mesh volume : {volume:.14g}")
+        print(f"Volume error         : {volume - exact_volume:.14g}")
+    else:
+        # Test mode - check if error is within tolerance
+        tol = 200 * libceed.EPSILON if dim == 1 else 1e-5
+        if abs(volume - exact_volume) > tol:
+            print(f"Volume error : {volume - exact_volume:.14g}")
+            sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/python/ex2_surface.py b/examples/python/ex2_surface.py
new file mode 100644
index 0000000000..97be1ef276
--- /dev/null
+++ b/examples/python/ex2_surface.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+#
+# libCEED example using diffusion operator to compute surface area
+#
+# Sample runs:
+#
+#     python ex2_surface.py
+#     python ex2_surface.py -c /cpu/self
+#     python ex2_surface.py -c /gpu/cuda
+
+import sys
+import os
+import numpy as np
+import libceed
+import ex_common as common
+
+
+def main():
+    """Main driver for surface area example"""
+    args = common.parse_arguments()
+    return example_2(args)
+
+
+def example_2(options):
+    """Compute surface area using diffusion operator
+
+    Args:
+        args: Parsed command line arguments
+
+    Returns:
+        int: 0 on success, error code on failure
+    """
+    # Process arguments
+    args = options
+    dim = args.dim
+    mesh_degree = max(args.mesh_degree, args.solution_degree)
+    sol_degree = args.solution_degree
+    num_qpts = args.quadrature_points
+    problem_size = args.problem_size if args.problem_size > 0 else (500 * dim * dim if args.test else 256 * 1024)
+    ncomp_x = dim  # Number of coordinate components
+
+    # Print configuration
+    if not args.quiet:
+        print("Selected options: [command line option] : <current value>")
+        print(f"    Ceed specification [-c] : {args.ceed}")
+        print(f"    Mesh dimension     [-d] : {dim}")
+        print(f"    Mesh degree        [-m] : {mesh_degree}")
+        print(f"    Solution degree    [-p] : {sol_degree}")
+        print(f"    Num. 1D quadr. pts [-q] : {num_qpts}")
+        print(f"    Approx. # unknowns [-s] : {problem_size}")
+        print(f"    QFunction source   [-g] : {'gallery' if args.gallery else 'user'}")
+
+    # Initialize CEED
+    ceed = libceed.Ceed(args.ceed)
+
+    # Create bases
+    # Tensor-product Lagrange basis for mesh coordinates
+    mesh_basis = ceed.BasisTensorH1Lagrange(
+        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Tensor-product Lagrange basis for solution
+    solution_basis = ceed.BasisTensorH1Lagrange(
+        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Create mesh
+    # Determine mesh size
+    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
+    if not args.quiet:
+        print("\nMesh size                   : nx = %d" % num_xyz[0], end="")
+        if dim > 1:
+            print(", ny = %d" % num_xyz[1], end="")
+        if dim > 2:
+            print(", nz = %d" % num_xyz[2], end="")
+        print()
+
+    # Create element restrictions
+    num_q_comp = dim * (dim + 1) // 2
+    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
+    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)
+
+    if not args.quiet:
+        print("Number of mesh nodes        : %d" % (mesh_size // dim))
+        print("Number of solution nodes    : %d" % sol_size)
+
+    # Create and transform mesh coordinates
+    mesh_coords = ceed.Vector(mesh_size)
+    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
+    _, exact_surface_area = common.transform_mesh_coords(dim, mesh_size, mesh_coords, use_sin=False)
+
+    # Create the QFunction that builds the diffusion operator (i.e. computes
+    # its quadrature data) and set its context data
+    qf_build = None
+    if args.gallery:
+        qf_build = ceed.QFunctionByName(f"Poisson{dim}DBuild")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_build = ceed.QFunction(1, qfs_so.build_diff,
+                                  os.path.join(file_dir, "ex2-surface.h:build_diff"))
+        qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
+        qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
+        qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_build.set_context(build_ctx)
+
+    # Operator for building quadrature data
+    op_build = ceed.Operator(qf_build)
+    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
+    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
+    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)
+
+    # Compute quadrature data
+    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
+    op_build.apply(mesh_coords, q_data)
+
+    # Create the QFunction that defines the action of the diffusion operator
+    qf_diff = None
+    if args.gallery:
+        qf_diff = ceed.QFunctionByName(f"Poisson{dim}DApply")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_diff = ceed.QFunction(1, qfs_so.apply_diff,
+                                 os.path.join(file_dir, "ex2-surface.h:apply_diff"))
+        qf_diff.add_input("du", dim, libceed.EVAL_GRAD)
+        qf_diff.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_diff.add_output("dv", dim, libceed.EVAL_GRAD)
+        qf_diff.set_context(build_ctx)
+
+    # Diffusion operator
+    op_diff = ceed.Operator(qf_diff)
+    op_diff.set_field("du", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_diff.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
+    op_diff.set_field("dv", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+
+    # Create vectors
+    u = ceed.Vector(sol_size)  # Input vector
+    v = ceed.Vector(sol_size)  # Output vector
+
+    # Initialize u with sum of coordinates (x + y + z)
+    with mesh_coords.array_read() as x_array, u.array_write() as u_array:
+        for i in range(sol_size):
+            u_array[i] = sum(x_array[i + j * (sol_size)] for j in range(dim))
+
+    # Apply operator: v = K * u
+    op_diff.apply(u, v)
+
+    # Compute surface area by summing absolute values of v
+    surface_area = 0.0
+    with v.array_read() as v_array:
+        surface_area = np.sum(abs(v_array))
+
+    if not args.test:
+        print()
+        print(f"Exact mesh surface area    : {exact_surface_area:.14g}")
+        print(f"Computed mesh surface area : {surface_area:.14g}")
+        print(f"Surface area error         : {surface_area - exact_surface_area:.14g}")
+    else:
+        # Test mode - check if error is within tolerance
+        tol = 10000 * libceed.EPSILON if dim == 1 else 1e-1
+        if abs(surface_area - exact_surface_area) > tol:
+            print(f"Surface area error : {surface_area - exact_surface_area:.14g}")
+            sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/python/ex3_volume.py b/examples/python/ex3_volume.py
new file mode 100644
index 0000000000..f803adaec8
--- /dev/null
+++ b/examples/python/ex3_volume.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+#
+# libCEED example using diffusion operator to compute surface area
+#
+# Sample runs:
+#
+#     python ex3_volume.py
+#     python ex3_volume -c /cpu/self
+#     python ex3_volume -c /gpu/cuda
+
+import sys
+import os
+import numpy as np
+import libceed
+import ex_common as common
+
+
+def main():
+    """Main function for volume example"""
+    args = common.parse_arguments()
+    example_3(args)
+
+
+def example_3(args):
+    """Compute volume using mass and diff operator
+
+    Args:
+        args: Parsed command line arguments
+
+    Returns:
+        int: 0 on success, error code on failure
+    """
+    # Process arguments
+    dim = args.dim
+    mesh_degree = max(args.mesh_degree, args.solution_degree)
+    sol_degree = args.solution_degree
+    num_qpts = args.quadrature_points
+    problem_size = args.problem_size if args.problem_size > 0 else (8 * 16 if args.test else 256 * 1024)
+    ncomp_x = dim  # Number of coordinate components
+
+    # Print configuration
+    if not args.quiet:
+        print("Selected options: [command line option] : <current value>")
+        print(f"    Ceed specification [-c] : {args.ceed}")
+        print(f"    Mesh dimension     [-d] : {dim}")
+        print(f"    Mesh degree        [-m] : {mesh_degree}")
+        print(f"    Solution degree    [-p] : {sol_degree}")
+        print(f"    Num. 1D quadr. pts [-q] : {num_qpts}")
+        print(f"    Approx. # unknowns [-s] : {problem_size}")
+        print(f"    QFunction source   [-g] : {'gallery' if args.gallery else 'user'}")
+
+    # Check - Gallery not supported
+    if args.gallery:
+        print("Gallery QFunction not supported for example 3")
+        sys.exit(1)
+
+    # Initialize CEED
+    ceed = libceed.Ceed(args.ceed)
+
+    # Create bases
+    # Tensor-product Lagrange basis for mesh coordinates
+    mesh_basis = ceed.BasisTensorH1Lagrange(
+        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Tensor-product Lagrange basis for solution
+    solution_basis = ceed.BasisTensorH1Lagrange(
+        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Create mesh
+    # Determine mesh size
+    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
+    if not args.quiet:
+        print("\nMesh size                   : nx = %d" % num_xyz[0], end="")
+        if dim > 1:
+            print(", ny = %d" % num_xyz[1], end="")
+        if dim > 2:
+            print(", nz = %d" % num_xyz[2], end="")
+        print()
+
+    # Create element restrictions
+    num_q_comp = 1 + dim * (dim + 1) // 2
+    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
+    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)
+
+    if not args.quiet:
+        print("Number of mesh nodes        : %d" % (mesh_size // dim))
+        print("Number of solution nodes    : %d" % sol_size)
+
+    # Create and transform mesh coordinates
+    mesh_coords = ceed.Vector(mesh_size)
+    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
+    exact_volume, _ = common.transform_mesh_coords(dim, mesh_size, mesh_coords)
+
+    # Create QFunction context
+    build_ctx = ceed.QFunctionContext()
+    ctx_data = np.array([dim, dim], dtype=np.int32)
+    build_ctx.set_data(ctx_data)
+
+    # Load QFunctions
+    qfs_so = common.load_qfs_so()
+    file_dir = os.path.dirname(os.path.abspath(__file__))
+
+    # Create the QFunction that builds the mass + diffusion operator (i.e.
+    # computes its quadrature data) and set its context data
+    qf_build = ceed.QFunction(1, qfs_so.build_mass_diff,
+                              os.path.join(file_dir, "ex3-volume.h:build_mass_diff"))
+    qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
+    qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
+    qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
+    qf_build.set_context(build_ctx)
+
+    # Create the operator that builds the quadrature data for the mass + diffusion operator
+    op_build = ceed.Operator(qf_build)
+    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
+    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
+    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)
+
+    # Compute the quadrature data for the mass + diffusion operator
+    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
+    op_build.apply(mesh_coords, q_data)
+
+    # Create the QFunction that defines the action of the mass + diffusion operator
+    qf_apply = ceed.QFunction(1, qfs_so.apply_mass_diff,
+                              os.path.join(file_dir, "ex3-volume.h:apply_mass_diff"))
+    qf_apply.add_input("u", 1, libceed.EVAL_INTERP)
+    qf_apply.add_input("du", dim, libceed.EVAL_GRAD)
+    qf_apply.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
+    qf_apply.add_output("v", 1, libceed.EVAL_INTERP)
+    qf_apply.add_output("dv", dim, libceed.EVAL_GRAD)
+    qf_apply.set_context(build_ctx)
+
+    # Create the mass + diffusion operator
+    op_apply = ceed.Operator(qf_apply)
+    op_apply.set_field("u", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_apply.set_field("du", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_apply.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
+    op_apply.set_field("v", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_apply.set_field("dv", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+
+    # Create solution vectors
+    u = ceed.Vector(sol_size)
+    v = ceed.Vector(sol_size)
+    u.set_value(1.0)  # Set all entries of u to 1.0
+
+    # Apply mass + diffusion operator: v = (M + K) * u
+    op_apply.apply(u, v)
+
+    # Compute volume by summing all entries in v
+    volume = 0.0
+    with v.array_read() as v_array:
+        # Simply sum all values to compute the volume
+        volume = np.sum(v_array)
+
+    if not args.test:
+        print()
+        print(f"Exact mesh volume    : {exact_volume:.14g}")
+        print(f"Computed mesh volume : {volume:.14g}")
+        print(f"Volume error         : {volume - exact_volume:.14g}")
+    else:
+        # Test mode - check if error is within tolerance
+        tol = 200 * libceed.EPSILON if dim == 1 else 1e-5
+        if abs(volume - exact_volume) > tol:
+            print(f"Volume error : {volume - exact_volume:.14g}")
+            sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/python/ex_common.py b/examples/python/ex_common.py
new file mode 100644
index 0000000000..db68b8f567
--- /dev/null
+++ b/examples/python/ex_common.py
@@ -0,0 +1,255 @@
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+import sys
+import os
+from sysconfig import get_config_var
+import argparse
+import math
+import numpy as np
+import libceed
+import ctypes
+
+
+def parse_arguments():
+    """Parse command line arguments for surface area computation
+
+    Returns:
+        Namespace: Parsed arguments with fields:
+            ceed: CEED resource specifier
+            dim: Problem dimension (1-3)
+            mesh_degree: Mesh polynomial degree
+            solution_degree: Solution polynomial degree
+            num_qpts: Number of quadrature points
+            problem_size: Approximate problem size
+            test: Test mode flag
+            quiet: Suppress output flag
+            gallery: Use gallery QFunctions flag
+    """
+    parser = argparse.ArgumentParser(description="libCEED surface area example")
+    parser.add_argument("-c", "--ceed", default="/cpu/self",
+                        help="libCEED resource specifier (default: /cpu/self)")
+    parser.add_argument("-d", "--dim", type=int, default=3,
+                        help="Problem dimension (1-3) (default: 3)")
+    parser.add_argument("-m", "--mesh-degree", type=int, default=4,
+                        help="Mesh polynomial degree (default: 4)")
+    parser.add_argument("-p", "--solution-degree", type=int, default=4,
+                        help="Solution polynomial degree (default: 4)")
+    parser.add_argument("-q", "--quadrature-points", type=int, default=6,
+                        help="Number of quadrature points (default: 6)")
+    parser.add_argument("-s", "--problem-size", type=int, default=-1,
+                        help="Approximate problem size (default: ~256k)")
+    parser.add_argument("-t", "--test", action="store_true",
+                        help="Test mode (reduced problem size)")
+    parser.add_argument("--quiet", action="store_true",
+                        help="Suppress output")
+    parser.add_argument("-g", "--gallery", action="store_true",
+                        help="Use gallery QFunctions")
+
+    args = parser.parse_args()
+    if args.dim not in [1, 2, 3]:
+        parser.error("Dimension must be 1, 2, or 3")
+    return args
+
+
+def get_cartesian_mesh_size(dim, degree, prob_size):
+    """Determine Cartesian mesh size for given problem size
+
+    Args:
+        dim: Spatial dimension (1-3)
+        degree: Polynomial degree
+        prob_size: Target problem size
+
+    Returns:
+        list: Number of elements in each dimension
+    """
+    # Calculate number of elements needed
+    num_elem = prob_size // (degree ** dim)
+
+    # Find smallest power of 2 >= num_elem
+    s = 0
+    while num_elem > 1:
+        num_elem = num_elem / 2
+        s += 1
+
+    # Distribute across dimensions
+    r = s % dim
+    num_xyz = []
+    for d in range(dim):
+        sd = s // dim
+        if r > 0:
+            sd += 1
+            r -= 1
+        num_xyz.append(1 << sd)
+    return num_xyz
+
+
+def build_cartesian_restriction(ceed, dim, num_xyz, degree, num_comp, num_q_comp, num_qpts, create_qdata=False):
+    """Build element restriction for Cartesian grid
+
+    Args:
+        ceed: libCEED context
+        dim: Spatial dimension
+        num_xyz: Elements per dimension
+        degree: Polynomial degree
+        num_comp: Number of components
+        num_q_comp: Number of quadrature data components
+        num_qpts: Quadrature points per dimension
+        build_qdata: Flag to build restriction for quadrature data
+
+    Returns:
+        tuple: (elem_restriction, size, q_data_restriction, num_elem, elem_qpts)
+    """
+    p = degree + 1  # Nodes per element per dimension
+    num_nodes = p ** dim
+    elem_qpts = num_qpts ** dim
+
+    # Calculate grid parameters
+    nd = []
+    num_elem = 1
+    scalar_size = 1
+    for d in range(dim):
+        num_elem *= num_xyz[d]
+        nd.append(num_xyz[d] * (p - 1) + 1)  # Nodes per dimension
+        scalar_size *= nd[d]
+
+    size = scalar_size * num_comp
+
+    # Create element connectivity
+    elem_nodes = np.zeros(num_elem * num_nodes, dtype=np.int32)
+    for e in range(num_elem):
+        # Get element coordinates
+        e_xyz = [0] * dim
+        re = e
+        for d in range(dim):
+            e_xyz[d] = re % num_xyz[d]
+            re //= num_xyz[d]
+
+        # Calculate global node numbers
+        for n in range(num_nodes):
+            g_node = 0
+            g_stride = 1
+            r_node = n
+            for d in range(dim):
+                g_node += (e_xyz[d] * (p - 1) + r_node % p) * g_stride
+                g_stride *= nd[d]
+                r_node //= p
+            elem_nodes[e * num_nodes + n] = g_node
+
+    # Create restrictions
+    elem_restriction = ceed.ElemRestriction(
+        num_elem, num_nodes, num_comp, scalar_size, size, elem_nodes)
+
+    q_data_restriction = None
+    if create_qdata:
+        strides = np.array([1, elem_qpts, elem_qpts * num_q_comp], dtype=np.int32)
+        q_data_restriction = ceed.StridedElemRestriction(
+            num_elem, elem_qpts, num_q_comp, num_elem * elem_qpts * num_q_comp, strides)
+
+    return elem_restriction, size, q_data_restriction, num_elem, elem_qpts
+
+
+def set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords):
+    """Create Cartesian mesh coordinates
+
+    Args:
+        ceed: libCEED context
+        dim: Spatial dimension
+        num_xyz: Elements per dimension
+        mesh_degree: Mesh polynomial degree
+        mesh_coords: CeedVector to hold mesh coordinates
+
+    Returns:
+        Vector: Mesh coordinates
+    """
+    p = mesh_degree + 1
+    nd = []
+    scalar_size = 1
+    for d in range(dim):
+        nd.append(num_xyz[d] * (p - 1) + 1)
+        scalar_size *= nd[d]
+
+    # Get Lobatto nodes (quadrature points)
+    nodes, _ = ceed.lobatto_quadrature(p)
+    nodes = 0.5 + 0.5 * nodes  # Map from [-1,1] to [0,1]
+
+    # Create coordinates
+    coords = np.zeros(scalar_size * dim)
+    for gs_node in range(scalar_size):
+        r_node = gs_node
+        for d in range(dim):
+            d_1d = r_node % nd[d]
+            elem_id = d_1d // (p - 1)
+            node_id = d_1d % (p - 1)
+            coords[gs_node + scalar_size * d] = (elem_id + nodes[node_id]) / num_xyz[d]
+            r_node //= nd[d]
+
+    mesh_coords.set_array(coords, cmode=libceed.COPY_VALUES)
+    return mesh_coords
+
+
+def transform_mesh_coords(dim, mesh_size, mesh_coords, use_sin=True):
+    """Transform mesh coordinates and return exact surface area
+
+    Args:
+        dim: Spatial dimension
+        mesh_size: Total mesh vector size
+        mesh_coords: Mesh coordinates vector
+        use_sin: Use sinusoidal transformation
+
+    Returns:
+        float: Tuple with exact volume and surface area for transformed mesh
+    """
+    exact_volume = {1: 1.0, 2: 3. / 4. * np.pi, 3: 3. / 4. * np.pi}[dim]
+    exact_area = {1: 2.0, 2: 4.0, 3: 6.0}[dim]
+
+    # Apply transformation to coordinates
+    num_nodes = mesh_size // dim
+    with mesh_coords.array_write() as coords:
+        if dim == 1:
+            for i in range(num_nodes):
+                x = coords[i] - 0.5
+                coords[i] = 0.5 + (1.0 / np.sqrt(3.0)) * np.sin((2.0 / 3.0) * np.pi * x)
+        else:
+            if use_sin:
+                for i in range(num_nodes):
+                    u = 1. + coords[i]
+                    v = np.pi / 2. * coords[i + num_nodes]
+                    coords[i] = u * np.cos(v)
+                    coords[i + num_nodes] = u * np.sin(v)
+            else:
+                for i in range(num_nodes):
+                    x = coords[i] - 0.5
+                    coords[i] = 0.5 + (1.0 / np.sqrt(3.0)) * np.sin((2.0 / 3.0) * np.pi * x)
+
+    return (exact_volume, exact_area)
+
+
+def find_qfs_so(name, path):
+    """Find the QFunctions shared library.
+    Returns:
+        Filepath to shared library object
+    """
+    for root, dirs, files in os.walk(path):
+        if name in files:
+            return os.path.join(root, name)
+
+
+def load_qfs_so():
+    """Load the QFunctions shared library.
+    Returns:
+        Loaded shared library object
+    """
+    file_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        "build")
+    qfs_so = find_qfs_so(
+        "libceed_c_qfunctions" + get_config_var("EXT_SUFFIX"),
+        file_dir)
+
+    # Load library
+    return ctypes.cdll.LoadLibrary(qfs_so)
diff --git a/examples/python/ex_test.py b/examples/python/ex_test.py
new file mode 100644
index 0000000000..e814007500
--- /dev/null
+++ b/examples/python/ex_test.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+import pytest
+from argparse import Namespace
+import ex1_volume
+import ex2_surface
+import ex3_volume
+
+# -------------------------------------------------------------------------------
+
+
+def test_101(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_101g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_102(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_102g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_103(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_103g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex1_volume.example_1(args)
+
+
+# -------------------------------------------------------------------------------
+def test_201(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_201g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_202(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_202g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_203(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_203g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_301(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex3_volume.example_3(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_302(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex3_volume.example_3(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_303(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex3_volume.example_3(args)
+
+# -------------------------------------------------------------------------------
diff --git a/examples/python/qfunctions/ex-common.h b/examples/python/qfunctions/ex-common.h
new file mode 100644
index 0000000000..b6e98500b2
--- /dev/null
+++ b/examples/python/qfunctions/ex-common.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed/types.h>
+
+/// A structure used to pass additional data
+struct BuildContext {
+  CeedInt dim, space_dim;
+};
diff --git a/examples/python/qfunctions/ex1-volume.h b/examples/python/qfunctions/ex1-volume.h
new file mode 100644
index 0000000000..984339cb86
--- /dev/null
+++ b/examples/python/qfunctions/ex1-volume.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed/types.h>
+#include "ex-common.h"
+
+/// libCEED Q-function for building quadrature data for a mass operator
+CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights with shape [1, Q]
+  const CeedScalar    *w          = in[1];
+  CeedScalar          *q_data     = out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] =
+            (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) +
+             J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
+            w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass operator
+CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0], out[0] are solution variables with shape [1, Q]
+  // in[1] is quadrature data with shape [1, Q]
+  const CeedScalar *u = in[0], *q_data = in[1];
+  CeedScalar       *v = out[0];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; }  // End of Quadrature Point Loop
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/python/qfunctions/ex2-surface.h b/examples/python/qfunctions/ex2-surface.h
new file mode 100644
index 0000000000..52f0aea1ae
--- /dev/null
+++ b/examples/python/qfunctions/ex2-surface.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed/types.h>
+#include "ex-common.h"
+
+/// libCEED Q-function for building quadrature data for a diffusion operator
+CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights, size (Q)
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[0][i] = w[i] / J[0][0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // J: 0 2   q_data: 0 2   adj(J):  J11 -J01
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
+
+        q_data[0][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[1][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[2][i] = -qw * (J00 * J01 + J10 * J11);
+      }  // End of Quadrature Point Loop
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Compute the adjoint
+        CeedScalar A[3][3];
+
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
+            // Equivalent code with J as a VLA and no mod operations:
+            // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
+          }
+        }
+
+        // Compute quadrature weight / det(J)
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Compute geometric factors
+        // Stored in Voigt convention
+        // 0 5 4
+        // 5 1 3
+        // 4 3 2
+        q_data[0][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[1][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[2][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[3][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a diff operator
+CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+  // in[0], out[0] solution gradients with shape [dim, 1, Q]
+  // in[1] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+
+  switch (build_data->dim) {
+    case 1: {
+      const CeedScalar *ug = in[0];
+      CeedScalar       *vg = out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 2: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 0 2
+        // 2 1
+        const CeedScalar dXdxdXdx_T[2][2] = {
+            {q_data[0][i], q_data[2][i]},
+            {q_data[2][i], q_data[1][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
+      }  // End of Quadrature Point Loop
+    } break;
+    case 3: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 0 5 4
+        // 5 1 3
+        // 4 3 2
+        const CeedScalar dXdxdXdx_T[3][3] = {
+            {q_data[0][i], q_data[5][i], q_data[4][i]},
+            {q_data[5][i], q_data[1][i], q_data[3][i]},
+            {q_data[4][i], q_data[3][i], q_data[2][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/python/qfunctions/ex3-volume.h b/examples/python/qfunctions/ex3-volume.h
new file mode 100644
index 0000000000..76489a622a
--- /dev/null
+++ b/examples/python/qfunctions/ex3-volume.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed.h>
+#include "ex-common.h"
+
+/// libCEED Q-function for building quadrature data for a mass + diffusion operator
+CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights, size (Q)
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11: {  // dim = 1, space_dim = 1
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        q_data[0][i] = w[i] * J[0][0][i];
+
+        // Diffusion
+        q_data[1][i] = w[i] / J[0][0][i];
+      }
+    } break;
+    case 22: {  // dim = 2, space_dim = 2
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // J: 0 2   q_data: 0 2   adj(J):  J22 -J12
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
+
+        // Mass
+        q_data[0][i] = w[i] * (J00 * J11 - J10 * J01);
+
+        // Diffusion
+        q_data[1][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[2][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[3][i] = -qw * (J00 * J01 + J10 * J11);
+      }
+    } break;
+    case 33: {  // dim = 3, space_dim = 3
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Compute the adjoint
+        CeedScalar A[3][3];
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
+          }
+        }
+
+        // Compute quadrature weight / det(J)
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Mass
+        q_data[0][i] = w[i] * (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Diffusion
+        // Stored in Voigt convention
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
+        q_data[1][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[2][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[3][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[4][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[6][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+      }
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass + diffusion operator
+CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+  // in[0], out[0] solution values with shape [1, 1, Q]
+  // in[1], out[1] solution gradients with shape [dim, 1, Q]
+  // in[2] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
+
+  switch (build_data->dim) {
+    case 1: {
+      const CeedScalar *u = in[0], *ug = in[1];
+      CeedScalar       *v = out[0], *vg = out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        vg[i] = q_data[1][i] * ug[i];
+      }
+    } break;
+    case 2: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 1 3
+        // 3 2
+        const CeedScalar dXdxdXdx_T[2][2] = {
+            {q_data[1][i], q_data[3][i]},
+            {q_data[3][i], q_data[2][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 2; j++) {
+          vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
+        }
+      }
+    } break;
+    case 3: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
+        const CeedScalar dXdxdXdx_T[3][3] = {
+            {q_data[1][i], q_data[6][i], q_data[5][i]},
+            {q_data[6][i], q_data[2][i], q_data[4][i]},
+            {q_data[5][i], q_data[4][i], q_data[3][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 3; j++) {
+          vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
+        }
+      }
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/python/qfunctions/qfunctions.c b/examples/python/qfunctions/qfunctions.c
new file mode 100644
index 0000000000..f7fd7f945d
--- /dev/null
+++ b/examples/python/qfunctions/qfunctions.c
@@ -0,0 +1,22 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+// -----------------------------------------------------------------------------
+// Redefine QFunction Macro
+// -----------------------------------------------------------------------------
+#undef CEED_QFUNCTION
+#define CEED_QFUNCTION(name) extern int name
+
+// -----------------------------------------------------------------------------
+// QFunction Sources
+// -----------------------------------------------------------------------------
+#include "ex1-volume.h"
+#include "ex2-surface.h"
+#include "ex3-volume.h"
+
+// -----------------------------------------------------------------------------
diff --git a/examples/python/setup_qfunctions.py b/examples/python/setup_qfunctions.py
new file mode 100644
index 0000000000..c92e6d397e
--- /dev/null
+++ b/examples/python/setup_qfunctions.py
@@ -0,0 +1,32 @@
+from setuptools import setup, Extension
+from sys import platform
+import os
+
+# Get CEED directory
+ceed_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Include directories
+include_dirs = [os.path.join(ceed_dir, "include")]
+
+# Library directories
+library_dirs = [os.path.join(ceed_dir, "lib")]
+
+# Source files
+sources = ["qfunctions/qfunctions.c"]
+
+# Compiler arguments
+extra_compile_args = []
+if platform == "linux" or platform == "linux2" or platform == "darwin":
+    extra_compile_args = ["-O3", "-march=native", "-std=c99"]
+
+# Define the extension module
+qfunctions = Extension("libceed_c_qfunctions",
+                       sources=sources,
+                       include_dirs=include_dirs,
+                       library_dirs=library_dirs,
+                       libraries=["ceed"],
+                       extra_compile_args=extra_compile_args)
+
+# Setup
+setup(name="libceed_c_qfunctions",
+      ext_modules=[qfunctions])
diff --git a/python/tests/Makefile b/python/tests/Makefile
index be8ad8e707..b13dd58922 100644
--- a/python/tests/Makefile
+++ b/python/tests/Makefile
@@ -5,6 +5,8 @@
 #
 # This file is part of CEED:  http://github.com/ceed
 
+PYTHON ?= python3
+
 clean:
 	rm -rf build __pycache__ .pytest_cache *.so
 
diff --git a/python/tests/test-0-ceed.py b/python/tests/test-0-ceed.py
index e8486049b4..76c2d14332 100644
--- a/python/tests/test-0-ceed.py
+++ b/python/tests/test-0-ceed.py
@@ -20,7 +20,7 @@ def test_000(ceed_resource):
     ceed = libceed.Ceed(ceed_resource)
 
 # -------------------------------------------------------------------------------
-# Test return of Ceed backend prefered memory type
+# Test return of Ceed backend preferred memory type
 # -------------------------------------------------------------------------------
 
 
From fd326ce88a53ff49092498e1dead7d137136fb60 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Sun, 4 May 2025 16:00:41 -0700
Subject: [PATCH 395/571] Use backend delegates for work vectors

---
 interface/ceed.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/interface/ceed.c b/interface/ceed.c
index 203d2e5790..2b8bfd1c1e 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -827,6 +827,15 @@ int CeedReference(Ceed ceed) {
   @ref Developer
 **/
 int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) {
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedGetWorkVectorMemoryUsage(delegate, usage_mb));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
   *usage_mb = 0.0;
   if (ceed->work_vectors) {
     for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
@@ -852,6 +861,15 @@ int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) {
   @ref Backend
 **/
 int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) {
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedClearWorkVectors(delegate, min_len));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
   if (!ceed->work_vectors) return CEED_ERROR_SUCCESS;
   for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
     if (ceed->work_vectors->is_in_use[i]) continue;
@@ -890,6 +908,16 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
   CeedInt    i = 0;
   CeedScalar usage_mb;
 
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedGetWorkVector(delegate, len, vec));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
+
   if (!ceed->work_vectors) CeedCall(CeedWorkVectorsCreate(ceed));
 
   // Search for big enough work vector
@@ -936,6 +964,16 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
   @ref Backend
 **/
 int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) {
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedRestoreWorkVector(delegate, vec));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
+
   for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
     if (*vec == ceed->work_vectors->vecs[i]) {
       CeedCheck(ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " was not checked out but is being returned");

From 6eee1ffc21ba1b6c5586e8a6fa353666fb694027 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Sun, 4 May 2025 16:01:02 -0700
Subject: [PATCH 396/571] Use persistent streams to allow truly async gen
 operators

---
 backends/hip-gen/ceed-hip-gen-operator.c | 44 +++++++++++++++++-------
 backends/hip-gen/ceed-hip-gen.h          |  1 +
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index fc90b93228..3b780d295a 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -22,9 +22,20 @@
 static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
   Ceed                  ceed;
   CeedOperator_Hip_gen *impl;
+  bool                  is_composite;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedInt num_suboperators;
+
+    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i]));
+      impl->streams[i] = NULL;
+    }
+  }
   if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
   if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
@@ -239,28 +250,35 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 }
 
 static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool              is_run_good[CEED_COMPOSITE_MAX] = {false};
-  CeedInt           num_suboperators;
-  const CeedScalar *input_arr  = NULL;
-  CeedScalar       *output_arr = NULL;
-  Ceed              ceed;
-  CeedOperator     *sub_operators;
+  bool                  is_run_good[CEED_COMPOSITE_MAX] = {true};
+  CeedInt               num_suboperators;
+  const CeedScalar     *input_arr = NULL;
+  CeedScalar           *output_arr;
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *impl;
+  CeedOperator         *sub_operators;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+  CeedCallBackend(CeedCompositeOperatorGetSubList(op, &sub_operators));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
   for (CeedInt i = 0; i < num_suboperators; i++) {
     CeedInt num_elem = 0;
 
-    CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
+    CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
     if (num_elem > 0) {
-      hipStream_t stream = NULL;
+      if (!impl->streams[i]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[i]));
+      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[i], input_arr, output_arr, &is_run_good[i], request));
+    } else {
+      is_run_good[i] = true;
+    }
+  }
 
-      CeedCallHip(ceed, hipStreamCreate(&stream));
-      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request));
-      CeedCallHip(ceed, hipStreamDestroy(stream));
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    if (impl->streams[i]) {
+      if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
     }
   }
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index e3e5c18975..4335302471 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -17,6 +17,7 @@ typedef struct {
   CeedInt       Q, Q_1d;
   CeedInt       max_P_1d;
   CeedInt       thread_1d;
+  hipStream_t   streams[CEED_COMPOSITE_MAX];
   hipModule_t   module;
   hipFunction_t op;
   FieldsInt_Hip indices;

From b46df0d23d416892813aae9c232a5a88657bbf88 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Tue, 6 May 2025 13:05:21 -0700
Subject: [PATCH 397/571] Add missing CeedDestroy

---
 backends/hip-ref/ceed-hip-ref-vector.c      | 5 ++---
 backends/hip-shared/ceed-hip-shared-basis.c | 3 +++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 639a11bb28..77f63b2fe4 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -305,14 +305,13 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st
   // Set value for synced device/host array
   if (impl->d_array) {
     CeedScalar *copy_array;
+    Ceed        ceed;
 
+    CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
 #if (HIP_VERSION >= 60000000)
     hipblasHandle_t handle;
     hipStream_t     stream;
-    Ceed            ceed;
-
-    CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
     CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
     CeedCallHipblas(ceed, hipblasGetStream(handle, &stream));
 #if defined(CEED_SCALAR_IS_FP32)
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 410d13af2e..ae1591995f 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -489,6 +489,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -644,6 +645,7 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -737,6 +739,7 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
     if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
         hip_data->device_prop.sharedMemPerBlock) {
       CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
+      CeedCallBackend(CeedDestroy(&ceed));
       return CEED_ERROR_SUCCESS;
     }
   }

From f8fffaefe2b0bf336e4f45347da405b61e8c02d8 Mon Sep 17 00:00:00 2001
From: Umesh Unnikrishnan <unnikrishnan@anl.gov>
Date: Thu, 8 May 2025 12:51:32 -0400
Subject: [PATCH 398/571] SYCL backend updates for latest oneAPI
 release/2025.0.5 (#1815)

* Fixes for oneAPI-2025

* Neat fix for oneAPI 2025

* style changes
---
 backends/sycl/ceed-sycl-common.sycl.cpp | 1 +
 backends/sycl/online_compiler.hpp       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp
index d75e3ec7db..253d00d077 100644
--- a/backends/sycl/ceed-sycl-common.sycl.cpp
+++ b/backends/sycl/ceed-sycl-common.sycl.cpp
@@ -8,6 +8,7 @@
 
 #include "ceed-sycl-common.hpp"
 
+#include <sstream>
 #include <string>
 #include <sycl/sycl.hpp>
 
diff --git a/backends/sycl/online_compiler.hpp b/backends/sycl/online_compiler.hpp
index f9fbf529fa..74d2577bc3 100644
--- a/backends/sycl/online_compiler.hpp
+++ b/backends/sycl/online_compiler.hpp
@@ -63,7 +63,7 @@ class device_arch {
 class online_compile_error : public sycl::exception {
  public:
   online_compile_error() = default;
-  online_compile_error(const std::string &Msg) : sycl::exception(Msg) {}
+  online_compile_error(const std::string &Msg) : sycl::exception(make_error_code(errc::invalid), Msg) {}
 };
 
 /// Designates a source language for the online compiler.

From 3c11f1fc4579930f14e261f40b0038a1553ab9e9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 12 May 2025 10:44:27 -0600
Subject: [PATCH 399/571] ex - add PETSc BP1+3 and BP2+4 examples

---
 examples/petsc/bps.c                    |  11 +-
 examples/petsc/bps.h                    |   2 +-
 examples/petsc/include/bpsproblemdata.h | 242 ++++++++++++++----------
 examples/petsc/include/structs.h        |   2 +-
 examples/petsc/qfunctions/bps/bp13.h    |  74 ++++++++
 examples/petsc/qfunctions/bps/bp24.h    |  92 +++++++++
 examples/petsc/qfunctions/bps/bp3.h     |   1 -
 examples/petsc/qfunctions/bps/bp4.h     |  10 +-
 examples/petsc/src/libceedsetup.c       |  34 +++-
 9 files changed, 347 insertions(+), 121 deletions(-)
 create mode 100644 examples/petsc/qfunctions/bps/bp13.h
 create mode 100644 examples/petsc/qfunctions/bps/bp24.h

diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index be55bbf5c4..5ced922afd 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -26,6 +26,8 @@
 //
 //TESTARGS(name="BP3, tet elements") -ceed {ceed_resource} -test -problem bp3 -degree 3 -ksp_max_it_clip 50,50 -simplex
 //TESTARGS(name="BP5, hex elements") -ceed {ceed_resource} -test -problem bp5 -degree 3 -ksp_max_it_clip 18,18
+//TESTARGS(name="BP1+3, hex elements") -ceed {ceed_resource} -test -problem bp1_3 -degree 3 -ksp_max_it_clip 18,18
+//TESTARGS(name="BP2+4, hex elements") -ceed {ceed_resource} -test -problem bp2_4 -degree 3 -ksp_max_it_clip 18,18
 
 /// @file
 /// CEED BPs example using PETSc with DMPlex
@@ -183,9 +185,9 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
   {
     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
-    if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2) {
+    if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2 || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24) {
       PetscCall(PCSetType(pc, PCJACOBI));
-      if (rp->simplex) {
+      if (rp->simplex || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24) {
         PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL));
       } else {
         PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM));
@@ -255,7 +257,10 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
       PetscCall(SetupErrorOperatorCtx(rp->comm, dm, ceed, ceed_data, X_loc, op_error, op_error_ctx));
       PetscScalar l2_error;
       PetscCall(ComputeL2Error(X, &l2_error, op_error_ctx));
-      PetscReal tol = 5e-2;
+      // Tighter tol for BP1, BP2
+      // Looser tol for BP3, BP4, BP5, and BP6 with extra for vector valued problems
+      // BP1+3 and BP2+4 follow the pattern for BP3 and BP4
+      PetscReal tol = rp->bp_choice < CEED_BP3 ? 5e-4 : (5e-2 + (rp->bp_choice % 2 == 1 ? 5e-3 : 0));
       if (!rp->test_mode || l2_error > tol) {
         PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, rp->comm));
         PetscCall(MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, rp->comm));
diff --git a/examples/petsc/bps.h b/examples/petsc/bps.h
index 1867020c74..0564f4aeb0 100644
--- a/examples/petsc/bps.h
+++ b/examples/petsc/bps.h
@@ -17,4 +17,4 @@ static const char *const mem_types[] = {"host", "device", "memType", "CEED_MEM_"
 typedef enum { COARSEN_UNIFORM = 0, COARSEN_LOGARITHMIC = 1 } CoarsenType;
 static const char *const coarsen_types[] = {"uniform", "logarithmic", "CoarsenType", "COARSEN", 0};
 
-static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "BPType", "CEED_BP", 0};
+static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "bp1_3", "bp2_4", "BPType", "CEED_BP", 0};
diff --git a/examples/petsc/include/bpsproblemdata.h b/examples/petsc/include/bpsproblemdata.h
index db5f2b7d04..4d50ccc98d 100644
--- a/examples/petsc/include/bpsproblemdata.h
+++ b/examples/petsc/include/bpsproblemdata.h
@@ -14,7 +14,9 @@
 
 #include "../include/structs.h"
 #include "../qfunctions/bps/bp1.h"
+#include "../qfunctions/bps/bp13.h"
 #include "../qfunctions/bps/bp2.h"
+#include "../qfunctions/bps/bp24.h"
 #include "../qfunctions/bps/bp3.h"
 #include "../qfunctions/bps/bp4.h"
 #include "../qfunctions/bps/common.h"
@@ -23,107 +25,141 @@
 // BP Option Data
 // -----------------------------------------------------------------------------
 
-BPData bp_options[6] = {
-    [CEED_BP1] = {.num_comp_u    = 1,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 1,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupMassGeo,
-                  .setup_rhs     = SetupMassRhs,
-                  .apply         = Mass,
-                  .error         = Error,
-                  .setup_geo_loc = SetupMassGeo_loc,
-                  .setup_rhs_loc = SetupMassRhs_loc,
-                  .apply_loc     = Mass_loc,
-                  .error_loc     = Error_loc,
-                  .in_mode       = CEED_EVAL_INTERP,
-                  .out_mode      = CEED_EVAL_INTERP,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_FALSE},
-    [CEED_BP2] = {.num_comp_u    = 3,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 1,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupMassGeo,
-                  .setup_rhs     = SetupMassRhs3,
-                  .apply         = Mass3,
-                  .error         = Error3,
-                  .setup_geo_loc = SetupMassGeo_loc,
-                  .setup_rhs_loc = SetupMassRhs3_loc,
-                  .apply_loc     = Mass3_loc,
-                  .error_loc     = Error3_loc,
-                  .in_mode       = CEED_EVAL_INTERP,
-                  .out_mode      = CEED_EVAL_INTERP,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_FALSE},
-    [CEED_BP3] = {.num_comp_u    = 1,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs,
-                  .apply         = Diff,
-                  .error         = Error,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs_loc,
-                  .apply_loc     = Diff_loc,
-                  .error_loc     = Error_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_TRUE },
-    [CEED_BP4] = {.num_comp_u    = 3,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs3,
-                  .apply         = Diff3,
-                  .error         = Error3,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs3_loc,
-                  .apply_loc     = Diff3_loc,
-                  .error_loc     = Error3_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_TRUE },
-    [CEED_BP5] = {.num_comp_u    = 1,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 0,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs,
-                  .apply         = Diff,
-                  .error         = Error,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs_loc,
-                  .apply_loc     = Diff_loc,
-                  .error_loc     = Error_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS_LOBATTO,
-                  .enforce_bc    = PETSC_TRUE },
-    [CEED_BP6] = {.num_comp_u    = 3,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 0,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs3,
-                  .apply         = Diff3,
-                  .error         = Error3,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs3_loc,
-                  .apply_loc     = Diff3_loc,
-                  .error_loc     = Error3_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS_LOBATTO,
-                  .enforce_bc    = PETSC_TRUE }
+BPData bp_options[8] = {
+    [CEED_BP1]  = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 1,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupMassGeo,
+                   .setup_rhs     = SetupMassRhs,
+                   .apply         = Mass,
+                   .error         = Error,
+                   .setup_geo_loc = SetupMassGeo_loc,
+                   .setup_rhs_loc = SetupMassRhs_loc,
+                   .apply_loc     = Mass_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_INTERP,
+                   .out_mode      = CEED_EVAL_INTERP,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_FALSE},
+    [CEED_BP2]  = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 1,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupMassGeo,
+                   .setup_rhs     = SetupMassRhs3,
+                   .apply         = Mass3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupMassGeo_loc,
+                   .setup_rhs_loc = SetupMassRhs3_loc,
+                   .apply_loc     = Mass3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_INTERP,
+                   .out_mode      = CEED_EVAL_INTERP,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_FALSE},
+    [CEED_BP3]  = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs,
+                   .apply         = Diff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs_loc,
+                   .apply_loc     = Diff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP4]  = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs3,
+                   .apply         = Diff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs3_loc,
+                   .apply_loc     = Diff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP5]  = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs,
+                   .apply         = Diff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs_loc,
+                   .apply_loc     = Diff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP6]  = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs3,
+                   .apply         = Diff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs3_loc,
+                   .apply_loc     = Diff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP13] = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs,
+                   .apply         = MassDiff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs_loc,
+                   .apply_loc     = MassDiff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP24] = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs3,
+                   .apply         = MassDiff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs3_loc,
+                   .apply_loc     = MassDiff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
 };
diff --git a/examples/petsc/include/structs.h b/examples/petsc/include/structs.h
index 63e1656763..ff6602c891 100644
--- a/examples/petsc/include/structs.h
+++ b/examples/petsc/include/structs.h
@@ -65,7 +65,7 @@ typedef struct {
 } BPData;
 
 // BP options
-typedef enum { CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5 } BPType;
+typedef enum { CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5, CEED_BP13 = 6, CEED_BP24 = 7 } BPType;
 
 // -----------------------------------------------------------------------------
 // Parameter structure for running problems
diff --git a/examples/petsc/qfunctions/bps/bp13.h b/examples/petsc/qfunctions/bps/bp13.h
new file mode 100644
index 0000000000..a721b21f5b
--- /dev/null
+++ b/examples/petsc/qfunctions/bps/bp13.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// libCEED QFunctions for diffusion operator example using PETSc
+
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <math.h>
+#endif
+
+// -----------------------------------------------------------------------------
+// This QFunction sets up the rhs and true solution for the problem
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(SetupMassDiffRhs)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+  const CeedScalar *x = in[0], *w = in[1];
+  CeedScalar       *true_soln = out[0], *rhs = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    const CeedScalar c[3] = {0, 1., 2.};
+    const CeedScalar k[3] = {1., 2., 3.};
+
+    true_soln[i] = sin(M_PI * (c[0] + k[0] * x[i + Q * 0])) * sin(M_PI * (c[1] + k[1] * x[i + Q * 1])) * sin(M_PI * (c[2] + k[2] * x[i + Q * 2]));
+
+    rhs[i] = w[i + Q * 0] * (M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) + 1.0) * true_soln[i];
+  }  // End of Quadrature Point Loop
+  return 0;
+}
+
+// -----------------------------------------------------------------------------
+// This QFunction applies the mass + diffusion operator for a scalar field.
+//
+// Inputs:
+//   u       - Input vector at quadrature points
+//   ug      - Input vector gradient at quadrature points
+//   q_data  - Geometric factors
+//
+// Output:
+//   v      - Output vector (test functions) at quadrature points
+//   vg     - Output vector (test functions) gradient at quadrature points
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(MassDiff)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2];
+  CeedScalar       *v = out[0], *vg = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    // Read spatial derivatives of u
+    const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]};
+    // Read q_data (dXdxdXdx_T symmetric matrix)
+    const CeedScalar dXdxdXdx_T[3][3] = {
+        {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]},
+        {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]},
+        {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]}
+    };
+
+    // Mass
+    v[i] = q_data[i + 0 * Q] * u[i];
+    // Diff
+    for (int j = 0; j < 3; j++) {  // j = direction of vg
+      vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]);
+    }
+  }  // End of Quadrature Point Loop
+  return 0;
+}
+// -----------------------------------------------------------------------------
diff --git a/examples/petsc/qfunctions/bps/bp24.h b/examples/petsc/qfunctions/bps/bp24.h
new file mode 100644
index 0000000000..dad569f852
--- /dev/null
+++ b/examples/petsc/qfunctions/bps/bp24.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// libCEED QFunctions for diffusion operator example using PETSc
+
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <math.h>
+#endif
+
+// -----------------------------------------------------------------------------
+// This QFunction sets up the rhs and true solution for the problem
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(SetupMassDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+  const CeedScalar *x = in[0], *w = in[1];
+  CeedScalar       *true_soln = out[0], *rhs = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    const CeedScalar c[3] = {0, 1., 2.};
+    const CeedScalar k[3] = {1., 2., 3.};
+
+    // Component 1
+    true_soln[i + 0 * Q] =
+        sin(M_PI * (c[0] + k[0] * x[i + Q * 0])) * sin(M_PI * (c[1] + k[1] * x[i + Q * 1])) * sin(M_PI * (c[2] + k[2] * x[i + Q * 2]));
+    // Component 2
+    true_soln[i + 1 * Q] = 2 * true_soln[i + 0 * Q];
+    // Component 3
+    true_soln[i + 2 * Q] = 3 * true_soln[i + 0 * Q];
+
+    // Component 1
+    rhs[i + 0 * Q] = w[i + Q * 0] * (M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) + 1.0) * true_soln[i + 0 * Q];
+    // Component 2
+    rhs[i + 1 * Q] = 2 * rhs[i + 0 * Q];
+    // Component 3
+    rhs[i + 2 * Q] = 3 * rhs[i + 0 * Q];
+  }  // End of Quadrature Point Loop
+  return 0;
+}
+
+// -----------------------------------------------------------------------------
+// This QFunction applies the mass + diffusion operator for a vector field of 3 components.
+//
+// Inputs:
+//   u       - Input vector at quadrature points
+//   ug      - Input vector Jacobian at quadrature points
+//   q_data  - Geometric factors
+//
+// Output:
+//   v      - Output vector (test functions) at quadrature points
+//   vJ     - Output vector (test functions) Jacobian at quadrature points
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(MassDiff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2];
+  CeedScalar       *v = out[0], *vg = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    // Read spatial derivatives of u components
+    const CeedScalar uJ[3][3] = {
+        {ug[i + (0 + 0 * 3) * Q], ug[i + (0 + 1 * 3) * Q], ug[i + (0 + 2 * 3) * Q]},
+        {ug[i + (1 + 0 * 3) * Q], ug[i + (1 + 1 * 3) * Q], ug[i + (1 + 2 * 3) * Q]},
+        {ug[i + (2 + 0 * 3) * Q], ug[i + (2 + 1 * 3) * Q], ug[i + (2 + 2 * 3) * Q]}
+    };
+    // Read q_data (dXdxdXdx_T symmetric matrix)
+    const CeedScalar dXdxdXdx_T[3][3] = {
+        {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]},
+        {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]},
+        {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]}
+    };
+
+    for (int k = 0; k < 3; k++) {  // k = component
+      // Mass
+      v[i + k * Q] = q_data[i + 0 * Q] * u[i + k * Q];
+      // Diff
+      for (int j = 0; j < 3; j++) {  // j = direction of vg
+        vg[i + (k + j * 3) * Q] = (uJ[k][0] * dXdxdXdx_T[0][j] + uJ[k][1] * dXdxdXdx_T[1][j] + uJ[k][2] * dXdxdXdx_T[2][j]);
+      }
+    }
+  }  // End of Quadrature Point Loop
+
+  return 0;
+}
+// -----------------------------------------------------------------------------
diff --git a/examples/petsc/qfunctions/bps/bp3.h b/examples/petsc/qfunctions/bps/bp3.h
index ade23682c8..aeac6005d9 100644
--- a/examples/petsc/qfunctions/bps/bp3.h
+++ b/examples/petsc/qfunctions/bps/bp3.h
@@ -89,7 +89,6 @@ CEED_QFUNCTION(SetupDiffRhs)(void *ctx, CeedInt Q, const CeedScalar *const *in,
 
     rhs[i] = w[i + Q * 0] * M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) * true_soln[i];
   }  // End of Quadrature Point Loop
-
   return 0;
 }
 
diff --git a/examples/petsc/qfunctions/bps/bp4.h b/examples/petsc/qfunctions/bps/bp4.h
index 4771fb479d..89d8ce98e9 100644
--- a/examples/petsc/qfunctions/bps/bp4.h
+++ b/examples/petsc/qfunctions/bps/bp4.h
@@ -43,7 +43,6 @@ CEED_QFUNCTION(SetupDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in,
     // Component 3
     rhs[i + 2 * Q] = 3 * rhs[i + 0 * Q];
   }  // End of Quadrature Point Loop
-
   return 0;
 }
 
@@ -58,7 +57,7 @@ CEED_QFUNCTION(SetupDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in,
 //   vJ     - Output vector (test functions) Jacobian at quadrature points
 // -----------------------------------------------------------------------------
 CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  const CeedScalar *ug = in[0], *qd = in[1];
+  const CeedScalar *ug = in[0], *q_data = in[1];
   CeedScalar       *vg = out[0];
 
   // Quadrature Point Loop
@@ -71,9 +70,9 @@ CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca
     };
     // Read q_data (dXdxdXdx_T symmetric matrix)
     const CeedScalar dXdxdXdx_T[3][3] = {
-        {qd[i + 1 * Q], qd[i + 2 * Q], qd[i + 3 * Q]},
-        {qd[i + 2 * Q], qd[i + 4 * Q], qd[i + 5 * Q]},
-        {qd[i + 3 * Q], qd[i + 5 * Q], qd[i + 6 * Q]}
+        {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]},
+        {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]},
+        {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]}
     };
 
     for (int k = 0; k < 3; k++) {    // k = component
@@ -82,7 +81,6 @@ CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca
       }
     }
   }  // End of Quadrature Point Loop
-
   return 0;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/petsc/src/libceedsetup.c b/examples/petsc/src/libceedsetup.c
index 50c174a939..446cf93f72 100644
--- a/examples/petsc/src/libceedsetup.c
+++ b/examples/petsc/src/libceedsetup.c
@@ -105,18 +105,40 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt to
     CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE);
 
     // Set up PDE operator
-    CeedInt in_scale  = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1;
-    CeedInt out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1;
+    PetscBool is_interp = bp_data.in_mode == CEED_EVAL_INTERP;
+    CeedInt   in_scale  = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1;
+    CeedInt   out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1;
+
     CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply);
-    CeedQFunctionAddInput(qf_apply, "u", num_comp_u * in_scale, bp_data.in_mode);
+    if (bp_data.in_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedQFunctionAddInput(qf_apply, "u", num_comp_u, CEED_EVAL_INTERP);
+      CeedQFunctionAddInput(qf_apply, "du", num_comp_u * topo_dim, CEED_EVAL_GRAD);
+    } else {
+      CeedQFunctionAddInput(qf_apply, is_interp ? "u" : "du", num_comp_u * in_scale, bp_data.in_mode);
+    }
     CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE);
-    CeedQFunctionAddOutput(qf_apply, "v", num_comp_u * out_scale, bp_data.out_mode);
+    if (bp_data.out_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedQFunctionAddOutput(qf_apply, "v", num_comp_u, CEED_EVAL_INTERP);
+      CeedQFunctionAddOutput(qf_apply, "dv", num_comp_u * topo_dim, CEED_EVAL_GRAD);
+    } else {
+      CeedQFunctionAddOutput(qf_apply, is_interp ? "v" : "dv", num_comp_u * out_scale, bp_data.out_mode);
+    }
 
     // Create the mass or diff operator
     CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply);
-    CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    if (bp_data.in_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+      CeedOperatorSetField(op_apply, "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    } else {
+      CeedOperatorSetField(op_apply, is_interp ? "u" : "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    }
     CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data);
-    CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    if (bp_data.out_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+      CeedOperatorSetField(op_apply, "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    } else {
+      CeedOperatorSetField(op_apply, is_interp ? "v" : "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    }
 
     // Cleanup
     CeedQFunctionDestroy(&qf_setup_geo);

From 4a46e67df50c06c939702143cae794ef585805bd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 12 May 2025 12:13:18 -0600
Subject: [PATCH 400/571] doc - minor fixes

---
 README.md                                 | 2 +-
 examples/deal.II/{README.MD => README.md} | 5 ++---
 examples/fluids/README.md                 | 1 -
 examples/nek/README.md                    | 2 +-
 examples/petsc/README.md                  | 4 +---
 examples/solids/README.md                 | 1 -
 6 files changed, 5 insertions(+), 10 deletions(-)
 rename examples/deal.II/{README.MD => README.md} (71%)

diff --git a/README.md b/README.md
index 41fa318582..f81fcb23ae 100644
--- a/README.md
+++ b/README.md
@@ -232,7 +232,7 @@ For example:
 
 The `/*/occa` backends rely upon the [OCCA](http://github.com/libocca/occa) package to provide cross platform performance.
 To enable the OCCA backend, the environment variable `OCCA_DIR` must point to the top-level OCCA directory, with the OCCA library located in the `${OCCA_DIR}/lib` (By default, `OCCA_DIR` is set to `../occa`).
-OCCA version 1.4.0 or newer is required.
+OCCA version 1.6.0 or newer is required.
 
 Users can pass specific OCCA device properties after setting the CEED resource.
 For example:
diff --git a/examples/deal.II/README.MD b/examples/deal.II/README.md
similarity index 71%
rename from examples/deal.II/README.MD
rename to examples/deal.II/README.md
index cd3f14a3cb..f6065b2c53 100644
--- a/examples/deal.II/README.MD
+++ b/examples/deal.II/README.md
@@ -1,6 +1,5 @@
-An example how to write libCEED operators (BP1-BP6) within the open-source
-finite element library [deal.II](https://www.dealii.org/). As reference,
-operators are presented that use the native matrix-free infrastructure.
+An example how to write libCEED operators (BP1-BP6) within the open-source finite element library [deal.II](https://www.dealii.org/).
+As reference, operators are presented that use the native matrix-free infrastructure.
 
 First compile deal.II and libCEED individually. After that, compile the deal.II example:
 
diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index be0bd82597..79c33a1ee2 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -1,7 +1,6 @@
 ## libCEED: Navier-Stokes Example
 
 This page provides a description of the Navier-Stokes example for the libCEED library, based on PETSc.
-PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
 
 HONEE, a more fully featured fluid dynamics solver, can be found on [GitLab](https://gitlab.com/phypid/honee).
 
diff --git a/examples/nek/README.md b/examples/nek/README.md
index 6c1cfdee44..1b8faec64d 100644
--- a/examples/nek/README.md
+++ b/examples/nek/README.md
@@ -2,7 +2,7 @@
 
 ### Prerequisites
 
-Nek5000 v18.0 or greater must be [installed](https://nek5000.mcs.anl.gov/getstarted/) to run these examples.
+Nek5000 v19.0 or greater must be [installed](https://nek5000.mcs.anl.gov/getstarted/) to run these examples.
 It is assumed to exist at `../../../Nek5000` (a sibling to the libCEED directory) or at a path defined in the environment variable `NEK5K_DIR`.
 For example, you could set
 ```sh
diff --git a/examples/petsc/README.md b/examples/petsc/README.md
index 4ec9e4baff..b63e7d0e98 100644
--- a/examples/petsc/README.md
+++ b/examples/petsc/README.md
@@ -1,6 +1,6 @@
 ## libCEED + PETSc Examples
 
-PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
+This page provides a description of the CEED bakeoff problem examples for the libCEED library, based on PETSc.
 
 ### CEED bakeoff problems with raw mesh management - bpsraw
 
@@ -17,7 +17,6 @@ In addition to the common arguments, the following arguments may be set:
 ### CEED bakeoff problems with DMPlex - bps
 
 This code solves the CEED bakeoff problems on a unstructured grid using DMPlex.
-This example requires a PETSc version later than 3.11.3.
 
 To build, run `make bps`
 
@@ -43,7 +42,6 @@ The resulting log file can be read by the Python plotting scripts in `benchmarks
 ### CEED bakeoff problems with DMPlex and PCMG - multigrid
 
 This code solves the CEED bakeoff problems on a unstructured grid using DMPlex with p-multigrid implemented in PCMG.
-This example requires a PETSc version later than 3.11.3.
 
 To build, run `make multigrid`
 
diff --git a/examples/solids/README.md b/examples/solids/README.md
index 80f242b809..696a044098 100644
--- a/examples/solids/README.md
+++ b/examples/solids/README.md
@@ -1,7 +1,6 @@
 # libCEED: Solid Mechanics Example
 
 This page provides a description of the solid mechanics example for the libCEED library, based on PETSc.
-PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
 
 Ratel, a more fully featured solid mechanics library, can be found on [GitLab](https://gitlab.com/micromorph/ratel).
 

From 465e63d4360400a97854f5fe9df5a988c6c958c5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 15 May 2025 13:40:02 -0600
Subject: [PATCH 401/571] test - fix path to stay in folder

---
 tests/t406-qfunction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h
index db9235d3ff..e886a61b4c 100644
--- a/tests/t406-qfunction.h
+++ b/tests/t406-qfunction.h
@@ -19,7 +19,7 @@
 // Also test include path with "/../"
 #include "../tests/t406-qfunction-helper.h"
 // Also test include path with "/../../"
-#include "../../libCEED/tests/t406-qfunction-helper.h"
+#include "output/../../tests/t406-qfunction-helper.h"
 #  include "t406-qfunction-scales.h"
 // clang-format on
 

From 3eb59678ecb8a0fe884fb9297a6048221d053835 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 19 May 2025 15:00:33 -0600
Subject: [PATCH 402/571] rust - update rust examples to match c/python

---
 Cargo.toml                                    |   6 +-
 examples/deal.II/CMakeLists.txt               |   2 +-
 .../.gitignore                                |   0
 .../Cargo.toml                                |   2 +-
 .../src/main.rs                               |  28 +-
 .../src/opt.rs                                |   0
 .../src/transform.rs                          |   3 +-
 examples/rust/ex1-volume/src/transform.rs     |   3 +-
 .../.gitignore                                |   0
 .../Cargo.toml                                |   2 +-
 .../src/main.rs                               |  48 +-
 .../src/opt.rs                                |   0
 .../src/transform.rs                          |   3 +-
 examples/rust/ex2-surface/src/main.rs         |   1 -
 examples/rust/ex2-surface/src/transform.rs    |   3 +-
 examples/rust/ex3-volume-vector/.gitignore    |   2 +
 examples/rust/ex3-volume-vector/Cargo.toml    |  15 +
 examples/rust/ex3-volume-vector/src/main.rs   | 438 ++++++++++++++++++
 examples/rust/ex3-volume-vector/src/opt.rs    |  45 ++
 .../rust/ex3-volume-vector/src/transform.rs   |  50 ++
 examples/rust/ex3-volume/.gitignore           |   2 +
 examples/rust/ex3-volume/Cargo.toml           |  15 +
 examples/rust/ex3-volume/src/main.rs          | 415 +++++++++++++++++
 examples/rust/ex3-volume/src/opt.rs           |  45 ++
 examples/rust/ex3-volume/src/transform.rs     |  50 ++
 25 files changed, 1130 insertions(+), 48 deletions(-)
 rename examples/rust/{ex3-vector-volume => ex1-volume-vector}/.gitignore (100%)
 rename examples/rust/{ex3-vector-volume => ex1-volume-vector}/Cargo.toml (91%)
 rename examples/rust/{ex3-vector-volume => ex1-volume-vector}/src/main.rs (95%)
 rename examples/rust/{ex3-vector-volume => ex1-volume-vector}/src/opt.rs (100%)
 rename examples/rust/{ex3-vector-volume => ex1-volume-vector}/src/transform.rs (94%)
 rename examples/rust/{ex4-vector-surface => ex2-surface-vector}/.gitignore (100%)
 rename examples/rust/{ex4-vector-surface => ex2-surface-vector}/Cargo.toml (91%)
 rename examples/rust/{ex4-vector-surface => ex2-surface-vector}/src/main.rs (93%)
 rename examples/rust/{ex4-vector-surface => ex2-surface-vector}/src/opt.rs (100%)
 rename examples/rust/{ex4-vector-surface => ex2-surface-vector}/src/transform.rs (96%)
 create mode 100644 examples/rust/ex3-volume-vector/.gitignore
 create mode 100644 examples/rust/ex3-volume-vector/Cargo.toml
 create mode 100644 examples/rust/ex3-volume-vector/src/main.rs
 create mode 100644 examples/rust/ex3-volume-vector/src/opt.rs
 create mode 100644 examples/rust/ex3-volume-vector/src/transform.rs
 create mode 100644 examples/rust/ex3-volume/.gitignore
 create mode 100644 examples/rust/ex3-volume/Cargo.toml
 create mode 100644 examples/rust/ex3-volume/src/main.rs
 create mode 100644 examples/rust/ex3-volume/src/opt.rs
 create mode 100644 examples/rust/ex3-volume/src/transform.rs

diff --git a/Cargo.toml b/Cargo.toml
index a987ca8a95..218a10b19e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,8 +3,10 @@ members = [
         "rust/libceed",
         "rust/libceed-sys",
         "examples/rust/ex1-volume",
+        "examples/rust/ex1-volume-vector",
         "examples/rust/ex2-surface",
-        "examples/rust/ex3-vector-volume",
-        "examples/rust/ex4-vector-surface",
+        "examples/rust/ex2-surface-vector",
+        "examples/rust/ex3-volume",
+        "examples/rust/ex3-volume-vector",
         "examples/rust/mesh",
 ]
diff --git a/examples/deal.II/CMakeLists.txt b/examples/deal.II/CMakeLists.txt
index 272facfc00..46522cf0b8 100644
--- a/examples/deal.II/CMakeLists.txt
+++ b/examples/deal.II/CMakeLists.txt
@@ -1,4 +1,4 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8.8)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.5.0)
 
 FIND_PACKAGE(deal.II 8.0 QUIET
   HINTS ${deal.II_DIR} ${DEAL_II_DIR} ../ ../../ $ENV{DEAL_II_DIR}
diff --git a/examples/rust/ex3-vector-volume/.gitignore b/examples/rust/ex1-volume-vector/.gitignore
similarity index 100%
rename from examples/rust/ex3-vector-volume/.gitignore
rename to examples/rust/ex1-volume-vector/.gitignore
diff --git a/examples/rust/ex3-vector-volume/Cargo.toml b/examples/rust/ex1-volume-vector/Cargo.toml
similarity index 91%
rename from examples/rust/ex3-vector-volume/Cargo.toml
rename to examples/rust/ex1-volume-vector/Cargo.toml
index 3bee448ac7..d3f5b74832 100644
--- a/examples/rust/ex3-vector-volume/Cargo.toml
+++ b/examples/rust/ex1-volume-vector/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "ex3-vector-volume"
+name = "ex1-volume-vector"
 version = "0.11.0"
 authors = [
     "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex1-volume-vector/src/main.rs
similarity index 95%
rename from examples/rust/ex3-vector-volume/src/main.rs
rename to examples/rust/ex1-volume-vector/src/main.rs
index ef8636791a..b319d0b6f2 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex1-volume-vector/src/main.rs
@@ -31,12 +31,12 @@ mod transform;
 // ----------------------------------------------------------------------------
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
-    example_3(options)
+    example_1_vector(options)
 }
 
 #[allow(clippy::erasing_op)]
 #[allow(clippy::identity_op)]
-fn example_3(options: opt::Opt) -> libceed::Result<()> {
+fn example_1_vector(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
         ceed_spec,
@@ -304,7 +304,7 @@ mod tests {
     use super::*;
 
     #[test]
-    fn example_3_1d() {
+    fn example_1_vector_1d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -316,11 +316,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_2d() {
+    fn example_1_vector_2d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -332,11 +332,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_3d() {
+    fn example_1_vector_3d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -348,11 +348,11 @@ mod tests {
             quiet: false,
             gallery: false,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_1d_gallery() {
+    fn example_1_vector_1d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -364,11 +364,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_2d_gallery() {
+    fn example_1_vector_2d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -380,11 +380,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_3d_gallery() {
+    fn example_1_vector_3d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -396,7 +396,7 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 }
 
diff --git a/examples/rust/ex3-vector-volume/src/opt.rs b/examples/rust/ex1-volume-vector/src/opt.rs
similarity index 100%
rename from examples/rust/ex3-vector-volume/src/opt.rs
rename to examples/rust/ex1-volume-vector/src/opt.rs
diff --git a/examples/rust/ex3-vector-volume/src/transform.rs b/examples/rust/ex1-volume-vector/src/transform.rs
similarity index 94%
rename from examples/rust/ex3-vector-volume/src/transform.rs
rename to examples/rust/ex1-volume-vector/src/transform.rs
index 875194b829..e18f0e4948 100644
--- a/examples/rust/ex3-vector-volume/src/transform.rs
+++ b/examples/rust/ex1-volume-vector/src/transform.rs
@@ -37,7 +37,8 @@ pub(crate) fn transform_mesh_coordinates(
     // Exact volume of transformed region
     let exact_volume = match dim {
         1 => 1.0,
-        _ => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
+        2 | 3 => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
+        _ => unreachable!(),
     };
     Ok(exact_volume)
 }
diff --git a/examples/rust/ex1-volume/src/transform.rs b/examples/rust/ex1-volume/src/transform.rs
index 875194b829..e18f0e4948 100644
--- a/examples/rust/ex1-volume/src/transform.rs
+++ b/examples/rust/ex1-volume/src/transform.rs
@@ -37,7 +37,8 @@ pub(crate) fn transform_mesh_coordinates(
     // Exact volume of transformed region
     let exact_volume = match dim {
         1 => 1.0,
-        _ => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
+        2 | 3 => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
+        _ => unreachable!(),
     };
     Ok(exact_volume)
 }
diff --git a/examples/rust/ex4-vector-surface/.gitignore b/examples/rust/ex2-surface-vector/.gitignore
similarity index 100%
rename from examples/rust/ex4-vector-surface/.gitignore
rename to examples/rust/ex2-surface-vector/.gitignore
diff --git a/examples/rust/ex4-vector-surface/Cargo.toml b/examples/rust/ex2-surface-vector/Cargo.toml
similarity index 91%
rename from examples/rust/ex4-vector-surface/Cargo.toml
rename to examples/rust/ex2-surface-vector/Cargo.toml
index 6b41826088..4eac55c52e 100644
--- a/examples/rust/ex4-vector-surface/Cargo.toml
+++ b/examples/rust/ex2-surface-vector/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "ex4-vector-surface"
+name = "ex2-surface-vector"
 version = "0.11.0"
 authors = [
     "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex2-surface-vector/src/main.rs
similarity index 93%
rename from examples/rust/ex4-vector-surface/src/main.rs
rename to examples/rust/ex2-surface-vector/src/main.rs
index a05d35a222..9314572c62 100644
--- a/examples/rust/ex4-vector-surface/src/main.rs
+++ b/examples/rust/ex2-surface-vector/src/main.rs
@@ -32,12 +32,12 @@ mod transform;
 // ----------------------------------------------------------------------------
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
-    example_4(options)
+    example_2_vector(options)
 }
 
 #[allow(clippy::erasing_op)]
 #[allow(clippy::identity_op)]
-fn example_4(options: opt::Opt) -> libceed::Result<()> {
+fn example_2_vector(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
         ceed_spec,
@@ -256,7 +256,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
         match dim {
             1 => {
                 let q = qdata.len();
-                for c in 0..3 {
+                for c in 0..ncomp_u {
                     vg.iter_mut()
                         .skip(c * q)
                         .zip(ug.iter().skip(c * q).zip(qdata.iter()))
@@ -266,12 +266,12 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
             2 => {
                 let q = qdata.len() / 3;
                 for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 0 * q], qdata[i + 2 * q]],
+                        [qdata[i + 2 * q], qdata[i + 1 * q]],
+                    ];
                     for c in 0..ncomp_u {
                         let du = [ug[i + (c + 0 * ncomp_u) * q], ug[i + (c + 1 * ncomp_u) * q]];
-                        let dxdxdxdx_t = [
-                            [qdata[i + 0 * q], qdata[i + 2 * q]],
-                            [qdata[i + 2 * q], qdata[i + 1 * q]],
-                        ];
                         for j in 0..dim {
                             vg[i + (c + j * ncomp_u) * q] =
                                 du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j];
@@ -282,17 +282,17 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
             3 => {
                 let q = qdata.len() / 6;
                 for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 0 * q], qdata[i + 5 * q], qdata[i + 4 * q]],
+                        [qdata[i + 5 * q], qdata[i + 1 * q], qdata[i + 3 * q]],
+                        [qdata[i + 4 * q], qdata[i + 3 * q], qdata[i + 2 * q]],
+                    ];
                     for c in 0..ncomp_u {
                         let du = [
                             ug[i + (c + 0 * ncomp_u) * q],
                             ug[i + (c + 1 * ncomp_u) * q],
                             ug[i + (c + 2 * ncomp_u) * q],
                         ];
-                        let dxdxdxdx_t = [
-                            [qdata[i + 0 * q], qdata[i + 5 * q], qdata[i + 4 * q]],
-                            [qdata[i + 5 * q], qdata[i + 1 * q], qdata[i + 3 * q]],
-                            [qdata[i + 4 * q], qdata[i + 3 * q], qdata[i + 2 * q]],
-                        ];
                         for j in 0..dim {
                             vg[i + (c + j * ncomp_u) * q] = du[0] * dxdxdxdx_t[0][j]
                                 + du[1] * dxdxdxdx_t[1][j]
@@ -395,7 +395,7 @@ mod tests {
     use super::*;
 
     #[test]
-    fn example_4_1d() {
+    fn example_2_vector_1d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -407,11 +407,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_2d() {
+    fn example_2_vector_2d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -423,11 +423,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_3d() {
+    fn example_2_vector_3d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -439,11 +439,11 @@ mod tests {
             quiet: false,
             gallery: false,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_1d_gallery() {
+    fn example_2_vector_1d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -455,11 +455,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_2d_gallery() {
+    fn example_2_vector_2d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -471,11 +471,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_3d_gallery() {
+    fn example_2_vector_3d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -487,7 +487,7 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 }
 
diff --git a/examples/rust/ex4-vector-surface/src/opt.rs b/examples/rust/ex2-surface-vector/src/opt.rs
similarity index 100%
rename from examples/rust/ex4-vector-surface/src/opt.rs
rename to examples/rust/ex2-surface-vector/src/opt.rs
diff --git a/examples/rust/ex4-vector-surface/src/transform.rs b/examples/rust/ex2-surface-vector/src/transform.rs
similarity index 96%
rename from examples/rust/ex4-vector-surface/src/transform.rs
rename to examples/rust/ex2-surface-vector/src/transform.rs
index d8c21af927..5a15323e28 100644
--- a/examples/rust/ex4-vector-surface/src/transform.rs
+++ b/examples/rust/ex2-surface-vector/src/transform.rs
@@ -24,7 +24,8 @@ pub(crate) fn transform_mesh_coordinates(
     let exact_area = match dim {
         1 => 2.0,
         2 => 4.0,
-        _ => 6.0,
+        3 => 6.0,
+        _ => unreachable!(),
     };
     Ok(exact_area)
 }
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index de5172aef6..b4f892d30e 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -123,7 +123,6 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
         dim * (dim + 1) / 2,
         num_qpts,
     )?;
-
     let (rstr_solution, _) =
         mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?;
     let mesh_size = rstr_mesh.lvector_size();
diff --git a/examples/rust/ex2-surface/src/transform.rs b/examples/rust/ex2-surface/src/transform.rs
index d8c21af927..5a15323e28 100644
--- a/examples/rust/ex2-surface/src/transform.rs
+++ b/examples/rust/ex2-surface/src/transform.rs
@@ -24,7 +24,8 @@ pub(crate) fn transform_mesh_coordinates(
     let exact_area = match dim {
         1 => 2.0,
         2 => 4.0,
-        _ => 6.0,
+        3 => 6.0,
+        _ => unreachable!(),
     };
     Ok(exact_area)
 }
diff --git a/examples/rust/ex3-volume-vector/.gitignore b/examples/rust/ex3-volume-vector/.gitignore
new file mode 100644
index 0000000000..a9d37c560c
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/.gitignore
@@ -0,0 +1,2 @@
+target
+Cargo.lock
diff --git a/examples/rust/ex3-volume-vector/Cargo.toml b/examples/rust/ex3-volume-vector/Cargo.toml
new file mode 100644
index 0000000000..bfbe7241e0
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "ex3-volume-vector"
+version = "0.11.0"
+authors = [
+    "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
+]
+edition = "2018"
+
+[dependencies]
+clap = { version = "4.0.17", features = ["derive"] }
+libceed = { path = "../../../rust/libceed" }
+mesh = { path = "../mesh" }
+
+[package.metadata.release]
+release = false
diff --git a/examples/rust/ex3-volume-vector/src/main.rs b/examples/rust/ex3-volume-vector/src/main.rs
new file mode 100644
index 0000000000..0d572871b3
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/src/main.rs
@@ -0,0 +1,438 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+//
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a
+// 3D body using matrix-free application of a mass + diff operator.  Arbitrary
+// mesh and solution orders in 1D, 2D and 3D are supported from the same code.
+// This calculation is executed in triplicate with a 3 component vector system.
+//
+// The example has no dependencies, and is designed to be self-contained. For
+// additional examples that use external discretization libraries (MFEM, PETSc,
+// etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command
+// line argument (-ceed).
+
+use clap::Parser;
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
+mod opt;
+mod transform;
+
+// ----------------------------------------------------------------------------
+// Example 1
+// ----------------------------------------------------------------------------
+fn main() -> libceed::Result<()> {
+    let options = opt::Opt::parse();
+    example_3_vector(options)
+}
+
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
+fn example_3_vector(options: opt::Opt) -> libceed::Result<()> {
+    // Process command line arguments
+    let opt::Opt {
+        ceed_spec,
+        dim,
+        mesh_degree,
+        solution_degree,
+        num_qpts,
+        problem_size_requested,
+        test,
+        quiet,
+    } = options;
+    assert!((1..=3).contains(&dim));
+    assert!(mesh_degree >= 1);
+    assert!(solution_degree >= 1);
+    assert!(num_qpts >= 1);
+    let ncomp_x = dim;
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
+    } else {
+        problem_size_requested
+    };
+    let ncomp_u = 3;
+
+    // Summary output
+    if !quiet {
+        println!("Selected options: [command line option] : <current value>");
+        println!("    Ceed specification [-c] : {}", ceed_spec);
+        println!("    Mesh dimension     [-d] : {}", dim);
+        println!("    Mesh degree        [-m] : {}", mesh_degree);
+        println!("    Solution degree    [-p] : {}", solution_degree);
+        println!("    Num. 1D quadr. pts [-q] : {}", num_qpts);
+        println!("    Approx. # unknowns [-s] : {}", problem_size);
+        println!("    QFunction source        : user closure");
+    }
+
+    // Initalize ceed context
+    let ceed = Ceed::init(&ceed_spec);
+
+    // Mesh and solution bases
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_u,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+
+    // Determine mesh size from approximate problem size
+    let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
+    if !quiet {
+        print!("\nMesh size                   : nx = {}", num_xyz[0]);
+        if dim > 1 {
+            print!(", ny = {}", num_xyz[1]);
+        }
+        if dim > 2 {
+            print!(", nz = {}", num_xyz[2]);
+        }
+        println!();
+    }
+
+    // Build ElemRestriction objects describing the mesh and solution discrete
+    // representations
+    let (rstr_mesh, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?;
+    let (_, rstr_qdata) = mesh::build_cartesian_restriction(
+        &ceed,
+        dim,
+        num_xyz,
+        solution_degree,
+        1 + dim * (dim + 1) / 2,
+        num_qpts,
+    )?;
+    let (rstr_solution, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, ncomp_u, num_qpts)?;
+    let mesh_size = rstr_mesh.lvector_size();
+    let solution_size = rstr_solution.lvector_size();
+    if !quiet {
+        println!("Number of mesh nodes        : {}", mesh_size / dim);
+        println!("Number of solution nodes    : {}", solution_size);
+    }
+
+    // Create a Vector with the mesh coordinates
+    let mut mesh_coords = mesh::cartesian_mesh_coords(&ceed, dim, num_xyz, mesh_degree, mesh_size)?;
+
+    // Apply a transformation to the mesh coordinates
+    let exact_volume = transform::transform_mesh_coordinates(dim, mesh_size, &mut mesh_coords)?;
+
+    // QFunction that builds the quadrature data for the mass + diff operator
+    // -- QFunction from user closure
+    let build_mass_diff = move |[jacobian, weights, ..]: QFunctionInputs,
+                                [qdata, ..]: QFunctionOutputs| {
+        // Build quadrature data
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * jacobian[i];
+                    // Diff
+                    qdata[i + q * 1] = weights[i] / jacobian[i];
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    let j11 = jacobian[i + q * 0];
+                    let j21 = jacobian[i + q * 1];
+                    let j12 = jacobian[i + q * 2];
+                    let j22 = jacobian[i + q * 3];
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * (j11 * j22 - j21 * j12);
+                    // Diff
+                    let qw = weights[i] / (j11 * j22 - j21 * j12);
+                    qdata[i + q * 1] = qw * (j12 * j12 + j22 * j22);
+                    qdata[i + q * 2] = qw * (j11 * j11 + j21 * j21);
+                    qdata[i + q * 3] = -qw * (j11 * j12 + j21 * j22);
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    let mut a = [0.0; 9];
+                    for j in 0..3 {
+                        for k in 0..3 {
+                            a[k * 3 + j] = jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 1) % 3))]
+                                * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 2) % 3))]
+                                - jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 2) % 3))]
+                                    * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+                        }
+                    }
+                    // Mass
+                    qdata[i + q * 0] = weights[i]
+                        * (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    let qw = weights[i]
+                        / (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    // Diff
+                    qdata[i + q * 1] = qw
+                        * (a[0 * 3 + 0] * a[0 * 3 + 0]
+                            + a[0 * 3 + 1] * a[0 * 3 + 1]
+                            + a[0 * 3 + 2] * a[0 * 3 + 2]);
+                    qdata[i + q * 2] = qw
+                        * (a[1 * 3 + 0] * a[1 * 3 + 0]
+                            + a[1 * 3 + 1] * a[1 * 3 + 1]
+                            + a[1 * 3 + 2] * a[1 * 3 + 2]);
+                    qdata[i + q * 3] = qw
+                        * (a[2 * 3 + 0] * a[2 * 3 + 0]
+                            + a[2 * 3 + 1] * a[2 * 3 + 1]
+                            + a[2 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 4] = qw
+                        * (a[1 * 3 + 0] * a[2 * 3 + 0]
+                            + a[1 * 3 + 1] * a[2 * 3 + 1]
+                            + a[1 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 5] = qw
+                        * (a[0 * 3 + 0] * a[2 * 3 + 0]
+                            + a[0 * 3 + 1] * a[2 * 3 + 1]
+                            + a[0 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 6] = qw
+                        * (a[0 * 3 + 0] * a[1 * 3 + 0]
+                            + a[0 * 3 + 1] * a[1 * 3 + 1]
+                            + a[0 * 3 + 2] * a[1 * 3 + 2]);
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_build_closure = ceed
+        .q_function_interior(1, Box::new(build_mass_diff))?
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?;
+    // -- QFunction for use with Operator
+    let qf_build = QFunctionOpt::SomeQFunction(&qf_build_closure);
+
+    // Operator that build the quadrature data for the mass + diff operator
+    let op_build = ceed
+        .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("build qdata")?
+        .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)?
+        .field(
+            "weights",
+            ElemRestrictionOpt::None,
+            &basis_mesh,
+            VectorOpt::None,
+        )?
+        .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)?
+        .check()?;
+
+    // Compute the quadrature data for the mass + diff operator
+    let elem_qpts = num_qpts.pow(dim as u32);
+    let num_elem: usize = num_xyz.iter().take(dim).product();
+    let mut qdata = ceed.vector(num_elem * elem_qpts * (1 + dim * (dim + 1) / 2))?;
+    op_build.apply(&mesh_coords, &mut qdata)?;
+
+    // QFunction that applies the mass + diff operator
+    // -- QFunction from user closure
+    let apply_mass_diff = move |[u, ug, qdata, ..]: QFunctionInputs,
+                                [v, vg, ..]: QFunctionOutputs| {
+        // Apply diffusion operator
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    for c in 0..ncomp_u {
+                        // Mass
+                        v[i + c * q] = u[i + c * q] * qdata[i + 0 * q];
+                        // Diff
+                        vg[i + c * q] = ug[i + c * q] * qdata[i + 1 * q];
+                    }
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 3 * q]],
+                        [qdata[i + 3 * q], qdata[i + 2 * q]],
+                    ];
+                    for c in 0..ncomp_u {
+                        // Mass
+                        v[i + c * q] = u[i + c * q] * qdata[i + 0 * q];
+                        // Diff
+                        let du = [ug[i + (c + 0 * ncomp_u) * q], ug[i + (c + 1 * ncomp_u) * q]];
+                        for j in 0..2 {
+                            vg[i + (j + j * ncomp_u) * q] =
+                                du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j];
+                        }
+                    }
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 6 * q], qdata[i + 5 * q]],
+                        [qdata[i + 6 * q], qdata[i + 2 * q], qdata[i + 4 * q]],
+                        [qdata[i + 5 * q], qdata[i + 4 * q], qdata[i + 3 * q]],
+                    ];
+                    for c in 0..ncomp_u {
+                        // Mass
+                        v[i + c * q] = u[i + c * q] * qdata[i + 0 * q];
+                        // Diff
+                        let du = [
+                            ug[i + (c + 0 * ncomp_u) * q],
+                            ug[i + (c + 1 * ncomp_u) * q],
+                            ug[i + (c + 2 * ncomp_u) * q],
+                        ];
+                        for j in 0..3 {
+                            vg[i + (c + j * ncomp_u) * q] = du[0] * dxdxdxdx_t[0][j]
+                                + du[1] * dxdxdxdx_t[1][j]
+                                + du[2] * dxdxdxdx_t[2][j];
+                        }
+                    }
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_mass_diff_closure = ceed
+        .q_function_interior(1, Box::new(apply_mass_diff))?
+        .input("u", ncomp_u, libceed::EvalMode::Interp)?
+        .input("du", dim * ncomp_u, libceed::EvalMode::Grad)?
+        .input("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("v", ncomp_u, libceed::EvalMode::Interp)?
+        .output("dv", dim * ncomp_u, libceed::EvalMode::Grad)?;
+    // -- QFunction for use with Operator
+    let qf_mass_diff = QFunctionOpt::SomeQFunction(&qf_mass_diff_closure);
+
+    // Mass + diff Operator
+    let op_mass_diff = ceed
+        .operator(qf_mass_diff, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("mass diff")?
+        .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)?
+        .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .check()?;
+
+    // Solution vectors
+    let mut u = ceed.vector(solution_size)?;
+    let mut v = ceed.vector(solution_size)?;
+
+    // Initialize u with component index
+    u.set_value(0.0)?;
+    for c in 0..ncomp_u {
+        let q = solution_size / ncomp_u;
+        u.view_mut()?.iter_mut().skip(c * q).take(q).for_each(|u| {
+            *u = (c + 1) as libceed::Scalar;
+        });
+    }
+
+    // Apply the mass + diff operator
+    op_mass_diff.apply(&u, &mut v)?;
+
+    // Compute the mesh volume
+    let volume: libceed::Scalar = v.view()?.iter().sum::<libceed::Scalar>()
+        / ((ncomp_u * (ncomp_u + 1)) / 2) as libceed::Scalar;
+
+    // Output results
+    if !quiet {
+        println!("Exact mesh volume           : {:.12}", exact_volume);
+        println!("Computed mesh volume        : {:.12}", volume);
+        println!(
+            "Volume error                : {:.12e}",
+            volume - exact_volume
+        );
+    }
+    let tolerance = match dim {
+        1 => 200.0 * libceed::EPSILON,
+        _ => 1E-5,
+    };
+    let error = (volume - exact_volume).abs();
+    if error > tolerance {
+        println!("Volume error too large: {:.12e}", error);
+        return Err(libceed::Error {
+            message: format!(
+                "Volume error too large - expected: {:.12e}, actual: {:.12e}",
+                tolerance, error
+            ),
+        });
+    }
+    Ok(())
+}
+
+// ----------------------------------------------------------------------------
+// Tests
+// ----------------------------------------------------------------------------
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn example_3_vector_1d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 1,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3_vector(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_vector_2d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 2,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3_vector(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_vector_vector_3d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 3,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: false,
+        };
+        assert!(example_3_vector(options).is_ok());
+    }
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume-vector/src/opt.rs b/examples/rust/ex3-volume-vector/src/opt.rs
new file mode 100644
index 0000000000..e21db82caa
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/src/opt.rs
@@ -0,0 +1,45 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+use clap::Parser;
+
+// ----------------------------------------------------------------------------
+// Command line arguments
+// ----------------------------------------------------------------------------
+#[derive(Debug, Parser)]
+#[command(
+    name = "libCEED Rust Example 3 - Volume",
+    about = "This example uses the mass matrix and diffusion matrices to compute the length, area, or volume of a region, depending upon runtime parameters."
+)]
+pub(crate) struct Opt {
+    /// libCEED backend resource to use
+    #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
+    pub(crate) ceed_spec: String,
+    /// Mesh dimension
+    #[arg(short, long = "dimension", default_value = "3")]
+    pub(crate) dim: usize,
+    /// Polynomial degree for the mesh
+    #[arg(short, long, default_value = "4")]
+    pub(crate) mesh_degree: usize,
+    /// Polynomial degree for the solution
+    #[arg(short = 'p', long, default_value = "4")]
+    pub(crate) solution_degree: usize,
+    /// Number of quadrature points in 1D
+    #[arg(short = 'q', long, default_value = "6")]
+    pub(crate) num_qpts: usize,
+    /// Approximate problem size
+    #[arg(name = "DoF", short = 's', long = "problem_size", default_value = "-1")]
+    pub(crate) problem_size_requested: i64,
+    /// Test mode
+    #[arg(short, long)]
+    pub(crate) test: bool,
+    /// Quiet mode
+    #[arg(short = 'x', long)]
+    pub(crate) quiet: bool,
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume-vector/src/transform.rs b/examples/rust/ex3-volume-vector/src/transform.rs
new file mode 100644
index 0000000000..01054d2ecd
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/src/transform.rs
@@ -0,0 +1,50 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// ----------------------------------------------------------------------------
+// Transform mesh coordinates
+// ----------------------------------------------------------------------------
+pub(crate) fn transform_mesh_coordinates(
+    dim: usize,
+    mesh_size: usize,
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
+    // Transform coordinates
+    match dim {
+        1 => {
+            for coord in mesh_coords.view_mut()?.iter_mut() {
+                // map [0,1] to [0,1] varying the mesh density
+                *coord = 0.5
+                    + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                        * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5))
+                            .sin()
+            }
+        }
+        _ => {
+            let num_nodes = mesh_size / dim;
+            let mut coords = mesh_coords.view_mut()?;
+            for i in 0..num_nodes {
+                // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+                // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+                let u = coords[i] + 1.;
+                let v = coords[i + num_nodes] * std::f64::consts::PI / 2.;
+                coords[i] = u * v.cos();
+                coords[i + num_nodes] = u * v.sin();
+            }
+        }
+    }
+
+    // Exact volume of transformed region
+    let exact_volume = match dim {
+        1 => 1.,
+        2 | 3 => 3. / 4. * std::f64::consts::PI,
+        _ => unreachable!(),
+    };
+    Ok(exact_volume)
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume/.gitignore b/examples/rust/ex3-volume/.gitignore
new file mode 100644
index 0000000000..a9d37c560c
--- /dev/null
+++ b/examples/rust/ex3-volume/.gitignore
@@ -0,0 +1,2 @@
+target
+Cargo.lock
diff --git a/examples/rust/ex3-volume/Cargo.toml b/examples/rust/ex3-volume/Cargo.toml
new file mode 100644
index 0000000000..0c2f979c6b
--- /dev/null
+++ b/examples/rust/ex3-volume/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "ex3-volume"
+version = "0.11.0"
+authors = [
+    "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
+]
+edition = "2018"
+
+[dependencies]
+clap = { version = "4.0.17", features = ["derive"] }
+libceed = { path = "../../../rust/libceed" }
+mesh = { path = "../mesh" }
+
+[package.metadata.release]
+release = false
diff --git a/examples/rust/ex3-volume/src/main.rs b/examples/rust/ex3-volume/src/main.rs
new file mode 100644
index 0000000000..7f7b691be0
--- /dev/null
+++ b/examples/rust/ex3-volume/src/main.rs
@@ -0,0 +1,415 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+//
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a
+// 3D body using matrix-free application of a mass + diff operator.  Arbitrary
+// mesh and solution orders in 1D, 2D and 3D are supported from the same code.
+//
+// The example has no dependencies, and is designed to be self-contained. For
+// additional examples that use external discretization libraries (MFEM, PETSc,
+// etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command
+// line argument (-ceed).
+
+use clap::Parser;
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
+mod opt;
+mod transform;
+
+// ----------------------------------------------------------------------------
+// Example 1
+// ----------------------------------------------------------------------------
+fn main() -> libceed::Result<()> {
+    let options = opt::Opt::parse();
+    example_3(options)
+}
+
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
+fn example_3(options: opt::Opt) -> libceed::Result<()> {
+    // Process command line arguments
+    let opt::Opt {
+        ceed_spec,
+        dim,
+        mesh_degree,
+        solution_degree,
+        num_qpts,
+        problem_size_requested,
+        test,
+        quiet,
+    } = options;
+    assert!((1..=3).contains(&dim));
+    assert!(mesh_degree >= 1);
+    assert!(solution_degree >= 1);
+    assert!(num_qpts >= 1);
+    let ncomp_x = dim;
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
+    } else {
+        problem_size_requested
+    };
+
+    // Summary output
+    if !quiet {
+        println!("Selected options: [command line option] : <current value>");
+        println!("    Ceed specification [-c] : {}", ceed_spec);
+        println!("    Mesh dimension     [-d] : {}", dim);
+        println!("    Mesh degree        [-m] : {}", mesh_degree);
+        println!("    Solution degree    [-p] : {}", solution_degree);
+        println!("    Num. 1D quadr. pts [-q] : {}", num_qpts);
+        println!("    Approx. # unknowns [-s] : {}", problem_size);
+        println!("    QFunction source        : user closure");
+    }
+
+    // Initalize ceed context
+    let ceed = Ceed::init(&ceed_spec);
+
+    // Mesh and solution bases
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        1,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+
+    // Determine mesh size from approximate problem size
+    let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
+    if !quiet {
+        print!("\nMesh size                   : nx = {}", num_xyz[0]);
+        if dim > 1 {
+            print!(", ny = {}", num_xyz[1]);
+        }
+        if dim > 2 {
+            print!(", nz = {}", num_xyz[2]);
+        }
+        println!();
+    }
+
+    // Build ElemRestriction objects describing the mesh and solution discrete
+    // representations
+    let (rstr_mesh, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?;
+    let (_, rstr_qdata) = mesh::build_cartesian_restriction(
+        &ceed,
+        dim,
+        num_xyz,
+        solution_degree,
+        1 + dim * (dim + 1) / 2,
+        num_qpts,
+    )?;
+    let (rstr_solution, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?;
+    let mesh_size = rstr_mesh.lvector_size();
+    let solution_size = rstr_solution.lvector_size();
+    if !quiet {
+        println!("Number of mesh nodes        : {}", mesh_size / dim);
+        println!("Number of solution nodes    : {}", solution_size);
+    }
+
+    // Create a Vector with the mesh coordinates
+    let mut mesh_coords = mesh::cartesian_mesh_coords(&ceed, dim, num_xyz, mesh_degree, mesh_size)?;
+
+    // Apply a transformation to the mesh coordinates
+    let exact_volume = transform::transform_mesh_coordinates(dim, mesh_size, &mut mesh_coords)?;
+
+    // QFunction that builds the quadrature data for the mass + diff operator
+    // -- QFunction from user closure
+    let build_mass_diff = move |[jacobian, weights, ..]: QFunctionInputs,
+                                [qdata, ..]: QFunctionOutputs| {
+        // Build quadrature data
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * jacobian[i];
+                    // Diff
+                    qdata[i + q * 1] = weights[i] / jacobian[i];
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    let j11 = jacobian[i + q * 0];
+                    let j21 = jacobian[i + q * 1];
+                    let j12 = jacobian[i + q * 2];
+                    let j22 = jacobian[i + q * 3];
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * (j11 * j22 - j21 * j12);
+                    // Diff
+                    let qw = weights[i] / (j11 * j22 - j21 * j12);
+                    qdata[i + q * 1] = qw * (j12 * j12 + j22 * j22);
+                    qdata[i + q * 2] = qw * (j11 * j11 + j21 * j21);
+                    qdata[i + q * 3] = -qw * (j11 * j12 + j21 * j22);
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    let mut a = [0.0; 9];
+                    for j in 0..3 {
+                        for k in 0..3 {
+                            a[k * 3 + j] = jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 1) % 3))]
+                                * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 2) % 3))]
+                                - jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 2) % 3))]
+                                    * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+                        }
+                    }
+                    // Mass
+                    qdata[i + q * 0] = weights[i]
+                        * (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    let qw = weights[i]
+                        / (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    // Diff
+                    qdata[i + q * 1] = qw
+                        * (a[0 * 3 + 0] * a[0 * 3 + 0]
+                            + a[0 * 3 + 1] * a[0 * 3 + 1]
+                            + a[0 * 3 + 2] * a[0 * 3 + 2]);
+                    qdata[i + q * 2] = qw
+                        * (a[1 * 3 + 0] * a[1 * 3 + 0]
+                            + a[1 * 3 + 1] * a[1 * 3 + 1]
+                            + a[1 * 3 + 2] * a[1 * 3 + 2]);
+                    qdata[i + q * 3] = qw
+                        * (a[2 * 3 + 0] * a[2 * 3 + 0]
+                            + a[2 * 3 + 1] * a[2 * 3 + 1]
+                            + a[2 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 4] = qw
+                        * (a[1 * 3 + 0] * a[2 * 3 + 0]
+                            + a[1 * 3 + 1] * a[2 * 3 + 1]
+                            + a[1 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 5] = qw
+                        * (a[0 * 3 + 0] * a[2 * 3 + 0]
+                            + a[0 * 3 + 1] * a[2 * 3 + 1]
+                            + a[0 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 6] = qw
+                        * (a[0 * 3 + 0] * a[1 * 3 + 0]
+                            + a[0 * 3 + 1] * a[1 * 3 + 1]
+                            + a[0 * 3 + 2] * a[1 * 3 + 2]);
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_build_closure = ceed
+        .q_function_interior(1, Box::new(build_mass_diff))?
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?;
+    // -- QFunction for use with Operator
+    let qf_build = QFunctionOpt::SomeQFunction(&qf_build_closure);
+
+    // Operator that build the quadrature data for the mass + diff operator
+    let op_build = ceed
+        .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("build qdata")?
+        .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)?
+        .field(
+            "weights",
+            ElemRestrictionOpt::None,
+            &basis_mesh,
+            VectorOpt::None,
+        )?
+        .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)?
+        .check()?;
+
+    // Compute the quadrature data for the mass + diff operator
+    let elem_qpts = num_qpts.pow(dim as u32);
+    let num_elem: usize = num_xyz.iter().take(dim).product();
+    let mut qdata = ceed.vector(num_elem * elem_qpts * (1 + dim * (dim + 1) / 2))?;
+    op_build.apply(&mesh_coords, &mut qdata)?;
+
+    // QFunction that applies the mass + diff operator
+    // -- QFunction from user closure
+    let apply_mass_diff = move |[u, ug, qdata, ..]: QFunctionInputs,
+                                [v, vg, ..]: QFunctionOutputs| {
+        // Apply diffusion operator
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    // Mass
+                    v[i] = u[i] * qdata[i + 0 * q];
+                    // Diff
+                    vg[i] = ug[i] * qdata[i + 1 * q];
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    // Mass
+                    v[i] = u[i] * qdata[i + 0 * q];
+                    // Diff
+                    let du = [ug[i + q * 0], ug[i + q * 1]];
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 3 * q]],
+                        [qdata[i + 3 * q], qdata[i + 2 * q]],
+                    ];
+                    for j in 0..2 {
+                        vg[i + j * q] = du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j];
+                    }
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    // Mass
+                    v[i] = u[i] * qdata[i + 0 * q];
+                    // Diff
+                    let du = [ug[i + q * 0], ug[i + q * 1], ug[i + q * 2]];
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 6 * q], qdata[i + 5 * q]],
+                        [qdata[i + 6 * q], qdata[i + 2 * q], qdata[i + 4 * q]],
+                        [qdata[i + 5 * q], qdata[i + 4 * q], qdata[i + 3 * q]],
+                    ];
+                    for j in 0..3 {
+                        vg[i + j * q] = du[0] * dxdxdxdx_t[0][j]
+                            + du[1] * dxdxdxdx_t[1][j]
+                            + du[2] * dxdxdxdx_t[2][j];
+                    }
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_mass_diff_closure = ceed
+        .q_function_interior(1, Box::new(apply_mass_diff))?
+        .input("u", 1, libceed::EvalMode::Interp)?
+        .input("du", dim, libceed::EvalMode::Grad)?
+        .input("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("v", 1, libceed::EvalMode::Interp)?
+        .output("dv", dim, libceed::EvalMode::Grad)?;
+    // -- QFunction for use with Operator
+    let qf_mass_diff = QFunctionOpt::SomeQFunction(&qf_mass_diff_closure);
+
+    // Mass + diff Operator
+    let op_mass_diff = ceed
+        .operator(qf_mass_diff, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("mass diff")?
+        .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)?
+        .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .check()?;
+
+    // Solution vectors
+    let u = ceed.vector_from_slice(&vec![1.0; solution_size])?;
+    let mut v = ceed.vector(solution_size)?;
+
+    // Apply the mass + diff operator
+    op_mass_diff.apply(&u, &mut v)?;
+
+    // Compute the mesh volume
+    let volume: libceed::Scalar = v.view()?.iter().sum();
+
+    // Output results
+    if !quiet {
+        println!("Exact mesh volume           : {:.12}", exact_volume);
+        println!("Computed mesh volume        : {:.12}", volume);
+        println!(
+            "Volume error                : {:.12e}",
+            volume - exact_volume
+        );
+    }
+    let tolerance = match dim {
+        1 => 200.0 * libceed::EPSILON,
+        _ => 1E-5,
+    };
+    let error = (volume - exact_volume).abs();
+    if error > tolerance {
+        println!("Volume error too large: {:.12e}", error);
+        return Err(libceed::Error {
+            message: format!(
+                "Volume error too large - expected: {:.12e}, actual: {:.12e}",
+                tolerance, error
+            ),
+        });
+    }
+    Ok(())
+}
+
+// ----------------------------------------------------------------------------
+// Tests
+// ----------------------------------------------------------------------------
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn example_3_1d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 1,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_2d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 2,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_3d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 3,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: false,
+        };
+        assert!(example_3(options).is_ok());
+    }
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume/src/opt.rs b/examples/rust/ex3-volume/src/opt.rs
new file mode 100644
index 0000000000..e21db82caa
--- /dev/null
+++ b/examples/rust/ex3-volume/src/opt.rs
@@ -0,0 +1,45 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+use clap::Parser;
+
+// ----------------------------------------------------------------------------
+// Command line arguments
+// ----------------------------------------------------------------------------
+#[derive(Debug, Parser)]
+#[command(
+    name = "libCEED Rust Example 3 - Volume",
+    about = "This example uses the mass matrix and diffusion matrices to compute the length, area, or volume of a region, depending upon runtime parameters."
+)]
+pub(crate) struct Opt {
+    /// libCEED backend resource to use
+    #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
+    pub(crate) ceed_spec: String,
+    /// Mesh dimension
+    #[arg(short, long = "dimension", default_value = "3")]
+    pub(crate) dim: usize,
+    /// Polynomial degree for the mesh
+    #[arg(short, long, default_value = "4")]
+    pub(crate) mesh_degree: usize,
+    /// Polynomial degree for the solution
+    #[arg(short = 'p', long, default_value = "4")]
+    pub(crate) solution_degree: usize,
+    /// Number of quadrature points in 1D
+    #[arg(short = 'q', long, default_value = "6")]
+    pub(crate) num_qpts: usize,
+    /// Approximate problem size
+    #[arg(name = "DoF", short = 's', long = "problem_size", default_value = "-1")]
+    pub(crate) problem_size_requested: i64,
+    /// Test mode
+    #[arg(short, long)]
+    pub(crate) test: bool,
+    /// Quiet mode
+    #[arg(short = 'x', long)]
+    pub(crate) quiet: bool,
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume/src/transform.rs b/examples/rust/ex3-volume/src/transform.rs
new file mode 100644
index 0000000000..01054d2ecd
--- /dev/null
+++ b/examples/rust/ex3-volume/src/transform.rs
@@ -0,0 +1,50 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// ----------------------------------------------------------------------------
+// Transform mesh coordinates
+// ----------------------------------------------------------------------------
+pub(crate) fn transform_mesh_coordinates(
+    dim: usize,
+    mesh_size: usize,
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
+    // Transform coordinates
+    match dim {
+        1 => {
+            for coord in mesh_coords.view_mut()?.iter_mut() {
+                // map [0,1] to [0,1] varying the mesh density
+                *coord = 0.5
+                    + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                        * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5))
+                            .sin()
+            }
+        }
+        _ => {
+            let num_nodes = mesh_size / dim;
+            let mut coords = mesh_coords.view_mut()?;
+            for i in 0..num_nodes {
+                // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+                // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+                let u = coords[i] + 1.;
+                let v = coords[i + num_nodes] * std::f64::consts::PI / 2.;
+                coords[i] = u * v.cos();
+                coords[i + num_nodes] = u * v.sin();
+            }
+        }
+    }
+
+    // Exact volume of transformed region
+    let exact_volume = match dim {
+        1 => 1.,
+        2 | 3 => 3. / 4. * std::f64::consts::PI,
+        _ => unreachable!(),
+    };
+    Ok(exact_volume)
+}
+
+// ----------------------------------------------------------------------------

From 25220598ac232b61e90e8799d2beaac4b16966ad Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 19 May 2025 15:07:25 -0600
Subject: [PATCH 403/571] rust - fix build warning

---
 rust/libceed-sys/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rust/libceed-sys/src/lib.rs b/rust/libceed-sys/src/lib.rs
index 3335910adb..419ed405bb 100644
--- a/rust/libceed-sys/src/lib.rs
+++ b/rust/libceed-sys/src/lib.rs
@@ -18,5 +18,6 @@ pub mod bind_ceed {
     #![allow(non_camel_case_types)]
     #![allow(dead_code)]
     #![allow(clippy::too_long_first_doc_paragraph)]
+    #![allow(non_snake_case)]
     include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
 }

From 4566048fb6713186ca3ec70b6f0dfa7b673ba3a9 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 27 May 2025 10:53:55 -0600
Subject: [PATCH 404/571] ex - use deal.II fe to build libCEED Basis

---
 examples/deal.II/bps.h | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 32ac936aa7..720b57128d 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -202,8 +202,42 @@ class OperatorCeed : public OperatorBase<Number>
     // 2) create shape functions -> "ShapeInfo"
     const unsigned int fe_degree  = fe.tensor_degree();
     const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
-    CeedBasisCreateTensorH1Lagrange(
-      ceed, dim, n_components, fe_degree + 1, n_q_points, CEED_GAUSS, &sol_basis);
+    {
+      FE_Q<1>            fe_1d{FE_Q<1>(fe_degree)};
+      const unsigned int array_size  = (fe_degree + 1) * (n_q_points);
+      double            *q_ref_1d    = new double[n_q_points];
+      double            *q_weight_1d = new double[n_q_points];
+      double            *interp_1d   = new double[array_size];
+      double            *grad_1d     = new double[array_size];
+      for (unsigned int i = 0; i < n_q_points; i++)
+        {
+          // Retrieve quadrature info
+          // Converting from [0, 1] to [-1, 1]
+          Point point    = quadrature.get_tensor_basis()[0].point(i);
+          q_ref_1d[i]    = 2.0 * (point(0) - 0.5);
+          q_weight_1d[i] = 2.0 * quadrature.get_tensor_basis()[0].weight(i);
+
+          // Retrieve 1D shape function values
+          for (unsigned int j = 0; j < fe_degree + 1; j++)
+            {
+              // Shuffle index of DoF
+              const int k                        = j == 0 ? 0 : ((j % fe_degree) + 1);
+              interp_1d[j + i * (fe_degree + 1)] = fe_1d.shape_value_component(k, point, 0);
+              grad_1d[j + i * (fe_degree + 1)]   = 0.5 * fe_1d.shape_grad_component(k, point, 0)[0];
+            }
+        }
+
+      CeedBasisCreateTensorH1(ceed,
+                              dim,
+                              n_components,
+                              fe_degree + 1,
+                              n_q_points,
+                              interp_1d,
+                              grad_1d,
+                              q_ref_1d,
+                              q_weight_1d,
+                              &sol_basis);
+    }
 
     // 3) create restriction matrix -> DoFInfo
     unsigned int n_local_active_cells = 0;

From 571e8cf012cd9ba55521730d89ff29ce614d2bcd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 27 May 2025 12:32:40 -0600
Subject: [PATCH 405/571] ex - use ShapeInfo to build libCEED Basis

---
 examples/deal.II/bps.h | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 720b57128d..8e9924ed1b 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -24,6 +24,7 @@
 
 #include <deal.II/matrix_free/fe_evaluation.h>
 #include <deal.II/matrix_free/matrix_free.h>
+#include <deal.II/matrix_free/shape_info.h>
 #include <deal.II/matrix_free/tools.h>
 
 // libCEED includes
@@ -203,27 +204,30 @@ class OperatorCeed : public OperatorBase<Number>
     const unsigned int fe_degree  = fe.tensor_degree();
     const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
     {
-      FE_Q<1>            fe_1d{FE_Q<1>(fe_degree)};
-      const unsigned int array_size  = (fe_degree + 1) * (n_q_points);
-      double            *q_ref_1d    = new double[n_q_points];
-      double            *q_weight_1d = new double[n_q_points];
-      double            *interp_1d   = new double[array_size];
-      double            *grad_1d     = new double[array_size];
+      dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info;
+      shape_info.reinit(quadrature, fe, 0);
+      dealii::internal::MatrixFreeFunctions::UnivariateShapeData<double> shape_data =
+        shape_info.get_shape_data();
+
+      std::vector<CeedScalar> q_ref_1d(n_q_points);
+      std::vector<CeedScalar> q_weight_1d(n_q_points);
+      const unsigned int      array_size = (fe_degree + 1) * (n_q_points);
+      std::vector<CeedScalar> interp_1d(array_size);
+      std::vector<CeedScalar> grad_1d(array_size);
       for (unsigned int i = 0; i < n_q_points; i++)
         {
           // Retrieve quadrature info
-          // Converting from [0, 1] to [-1, 1]
-          Point point    = quadrature.get_tensor_basis()[0].point(i);
-          q_ref_1d[i]    = 2.0 * (point(0) - 0.5);
+          // Convert reference element from [0, 1] to [-1, 1]
+          q_ref_1d[i]    = 2.0 * (quadrature.get_tensor_basis()[0].point(i)(0) - 0.5);
           q_weight_1d[i] = 2.0 * quadrature.get_tensor_basis()[0].weight(i);
 
           // Retrieve 1D shape function values
           for (unsigned int j = 0; j < fe_degree + 1; j++)
             {
-              // Shuffle index of DoF
-              const int k                        = j == 0 ? 0 : ((j % fe_degree) + 1);
-              interp_1d[j + i * (fe_degree + 1)] = fe_1d.shape_value_component(k, point, 0);
-              grad_1d[j + i * (fe_degree + 1)]   = 0.5 * fe_1d.shape_grad_component(k, point, 0)[0];
+              interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j + i * (fe_degree + 1)];
+              // Scale derivatives by 1/2 for new reference element length
+              grad_1d[j + i * (fe_degree + 1)] =
+                0.5 * shape_data.shape_gradients[j + i * (fe_degree + 1)];
             }
         }
 
@@ -232,10 +236,10 @@ class OperatorCeed : public OperatorBase<Number>
                               n_components,
                               fe_degree + 1,
                               n_q_points,
-                              interp_1d,
-                              grad_1d,
-                              q_ref_1d,
-                              q_weight_1d,
+                              interp_1d.data(),
+                              grad_1d.data(),
+                              q_ref_1d.data(),
+                              q_weight_1d.data(),
                               &sol_basis);
     }
 

From 0dd71613d151b524f4c19d05b5d2bbb37bac7329 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 27 May 2025 14:01:18 -0600
Subject: [PATCH 406/571] ex - use native deal.II ref element

---
 examples/deal.II/bps.h | 57 +++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 8e9924ed1b..654c1b4358 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -208,38 +208,19 @@ class OperatorCeed : public OperatorBase<Number>
       shape_info.reinit(quadrature, fe, 0);
       dealii::internal::MatrixFreeFunctions::UnivariateShapeData<double> shape_data =
         shape_info.get_shape_data();
-
-      std::vector<CeedScalar> q_ref_1d(n_q_points);
-      std::vector<CeedScalar> q_weight_1d(n_q_points);
-      const unsigned int      array_size = (fe_degree + 1) * (n_q_points);
-      std::vector<CeedScalar> interp_1d(array_size);
-      std::vector<CeedScalar> grad_1d(array_size);
-      for (unsigned int i = 0; i < n_q_points; i++)
-        {
-          // Retrieve quadrature info
-          // Convert reference element from [0, 1] to [-1, 1]
-          q_ref_1d[i]    = 2.0 * (quadrature.get_tensor_basis()[0].point(i)(0) - 0.5);
-          q_weight_1d[i] = 2.0 * quadrature.get_tensor_basis()[0].weight(i);
-
-          // Retrieve 1D shape function values
-          for (unsigned int j = 0; j < fe_degree + 1; j++)
-            {
-              interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j + i * (fe_degree + 1)];
-              // Scale derivatives by 1/2 for new reference element length
-              grad_1d[j + i * (fe_degree + 1)] =
-                0.5 * shape_data.shape_gradients[j + i * (fe_degree + 1)];
-            }
-        }
+      std::vector<CeedScalar> q_ref_1d;
+      for (const auto q : quadrature.get_tensor_basis()[0].get_points())
+        q_ref_1d.push_back(q(0));
 
       CeedBasisCreateTensorH1(ceed,
                               dim,
                               n_components,
                               fe_degree + 1,
                               n_q_points,
-                              interp_1d.data(),
-                              grad_1d.data(),
+                              shape_data.shape_values.data(),
+                              shape_data.shape_gradients.data(),
                               q_ref_1d.data(),
-                              q_weight_1d.data(),
+                              quadrature.get_tensor_basis()[0].get_weights().data(),
                               &sol_basis);
     }
 
@@ -611,8 +592,28 @@ class OperatorCeed : public OperatorBase<Number>
 
     const unsigned int fe_degree = mapping_q->get_degree();
 
-    CeedBasisCreateTensorH1Lagrange(
-      ceed, dim, dim, fe_degree + 1, n_q_points, CEED_GAUSS, &geo_basis);
+    FE_Q<dim> geo_fe(fe_degree);
+
+    {
+      dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info;
+      shape_info.reinit(quadrature, geo_fe, 0);
+      dealii::internal::MatrixFreeFunctions::UnivariateShapeData<double> shape_data =
+        shape_info.get_shape_data();
+      std::vector<CeedScalar> q_ref_1d;
+      for (const auto q : quadrature.get_tensor_basis()[0].get_points())
+        q_ref_1d.push_back(q(0));
+
+      CeedBasisCreateTensorH1(ceed,
+                              dim,
+                              dim,
+                              fe_degree + 1,
+                              n_q_points,
+                              shape_data.shape_values.data(),
+                              shape_data.shape_gradients.data(),
+                              q_ref_1d.data(),
+                              quadrature.get_tensor_basis()[0].get_weights().data(),
+                              &geo_basis);
+    }
 
     unsigned int n_local_active_cells = 0;
 
@@ -623,8 +624,6 @@ class OperatorCeed : public OperatorBase<Number>
     std::vector<double>  geo_support_points;
     std::vector<CeedInt> geo_indices;
 
-    FE_Q<dim> geo_fe(fe_degree);
-
     DoFHandler<dim> geo_dof_handler(tria);
     geo_dof_handler.distribute_dofs(geo_fe);
 

From d89549c6e0afe634fc6cadb472a2a3224f4b1b63 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 May 2025 15:20:11 -0600
Subject: [PATCH 407/571] doc - hide inclusion markers in md

---
 README.md                           | 6 +++---
 doc/sphinx/source/gettingstarted.md | 2 +-
 examples/README.md                  | 8 ++++----
 examples/fluids/README.md           | 2 +-
 examples/fluids/index.md            | 2 +-
 examples/solids/README.md           | 2 +-
 examples/solids/index.md            | 2 +-
 7 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index f81fcb23ae..dcf4898a1c 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ The CEED research is supported by the [Exascale Computing Project](https://exasc
 
 For more details on the CEED API see the [user manual](https://libceed.org/en/latest/).
 
-% gettingstarted-inclusion-marker
+<!-- % gettingstarted-inclusion-marker-->
 
 ## Building
 
@@ -254,7 +254,7 @@ To build the examples, set the `MFEM_DIR`, `PETSC_DIR` (and optionally `PETSC_AR
 $ cd examples/
 ```
 
-% running-examples-inclusion-marker
+<!-- % running-examples-inclusion-marker-->
 
 ```console
 # libCEED examples on CPU and GPU
@@ -337,7 +337,7 @@ The above code assumes a GPU-capable machine with the CUDA backends enabled.
 Depending on the available backends, other CEED resource specifiers can be provided with the `-ceed` option.
 Other command line arguments can be found in [examples/petsc](https://github.com/CEED/libCEED/blob/main/examples/petsc/README.md).
 
-% benchmarks-marker
+<!-- % benchmarks-marker-->
 
 ## Benchmarks
 
diff --git a/doc/sphinx/source/gettingstarted.md b/doc/sphinx/source/gettingstarted.md
index 0f1a831b59..eb75a1e06c 100644
--- a/doc/sphinx/source/gettingstarted.md
+++ b/doc/sphinx/source/gettingstarted.md
@@ -1,5 +1,5 @@
 # Getting Started
 
 ```{include} ./README.md
-:start-after: gettingstarted-inclusion-marker
+:start-after: gettingstarted-inclusion-marker-->
 ```
diff --git a/examples/README.md b/examples/README.md
index e1177992f0..3103440c1b 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,7 +9,7 @@ For more details, please see the dedicated [documentation section](https://libce
 
 ## Bakeoff Problems
 
-% bps-inclusion-marker
+<!-- % bps-inclusion-marker-->
 
 The Center for Efficient Exascale Discretizations (CEED) uses Bakeoff Problems (BPs) to test and compare the performance of high-order finite element implementations.
 The definitions of the problems are given on the ceed [website](https://ceed.exascaleproject.org/bps/).
@@ -46,7 +46,7 @@ The BPs are parametrized by the number $P$ of Gauss-Legendre-Lobatto nodal point
 A $Q$-point Gauss-Legendre quadrature is used for all BPs except BP5 and BP6, which choose $Q = P$ and Gauss-Legendre-Lobatto quadrature to collocate with the interpolation nodes.
 This latter choice is popular in applications that use spectral element methods because it produces a diagonal mass matrix (enabling easy explicit time integration) and significantly reduces the number of floating point operations to apply the operator.
 
-% bps-exclusion-marker
+<!-- % bps-exclusion-marker-->
 
 For a more detailed description of the operators employed in the BPs, please see the dedicated [BPs documentation section](https://libceed.org/en/latest/examples/bps.html).
 
@@ -75,6 +75,6 @@ For a detailed description, please see the corresponding [problems on the cubed-
 To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run
 
 ```{include} ../README.md
-:start-after: running-examples-inclusion-marker
-:end-before: benchmarks-marker
+:start-after: running-examples-inclusion-marker-->
+:end-before: benchmarks-marker-->
 ```
diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index 79c33a1ee2..9687420530 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -23,7 +23,7 @@ and run with:
 
 ## Runtime options
 
-% inclusion-fluids-marker
+<!-- % inclusion-fluids-marker-->
 
 The Navier-Stokes mini-app is controlled via command-line options.
 The following options are common among all problem types:
diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index 31e1a12aa2..ea341b9ff5 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -9,7 +9,7 @@ Moreover, the Navier-Stokes example has been developed using PETSc, so that the
 ## Running the mini-app
 
 ```{include} README.md
-:start-after: inclusion-fluids-marker
+:start-after: inclusion-fluids-marker-->
 ```
 ## The Navier-Stokes equations
 
diff --git a/examples/solids/README.md b/examples/solids/README.md
index 696a044098..e10c7ef5be 100644
--- a/examples/solids/README.md
+++ b/examples/solids/README.md
@@ -22,7 +22,7 @@ and run with:
 
 ## Runtime options
 
-% inclusion-solids-marker
+<!-- % inclusion-solids-marker-->
 
 The elasticity mini-app is controlled via command-line options, the following of which are mandatory.
 
diff --git a/examples/solids/index.md b/examples/solids/index.md
index e1334ff308..faceae2094 100644
--- a/examples/solids/index.md
+++ b/examples/solids/index.md
@@ -34,7 +34,7 @@ $$ (hyperelastic-cd)
 ## Running the mini-app
 
 ```{include} README.md
-:start-after: inclusion-solids-marker
+:start-after: inclusion-solids-marker-->
 ```
 
 (problem-linear-elasticity)=

From ec4241e62f4c94aba4d62657b73038495830e634 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 May 2025 15:22:23 -0600
Subject: [PATCH 408/571] doc - add missing function param

---
 interface/ceed.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/interface/ceed.c b/interface/ceed.c
index 2b8bfd1c1e..96f0093984 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -820,7 +820,8 @@ int CeedReference(Ceed ceed) {
 /**
   @brief Computes the current memory usage of the work vectors in a `Ceed` context and prints to debug.abort
 
-  @param[in] ceed `Ceed` context
+  @param[in]  ceed     `Ceed` context
+  @param[out] usage_mb Address of the variable where the MB of work vector usage will be stored
 
   @return An error code: 0 - success, otherwise - failure
 

From 84332965a123aba5756ec82f4904275fbee0d2ae Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 May 2025 15:25:08 -0600
Subject: [PATCH 409/571] doc - remove sphinx warning

---
 doc/sphinx/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
index 2dbcfa648a..4353a893b5 100755
--- a/doc/sphinx/source/conf.py
+++ b/doc/sphinx/source/conf.py
@@ -107,6 +107,7 @@
     "examples/nek/README.md",
     "examples/petsc/README.md",
     "examples/solid/README.md",
+    "examples/deal.II/README.md",
 ]
 
 # The name of the Pygments (syntax highlighting) style to use.

From 6d7116d72e016bae92b657063812cb029c7e7da4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 28 May 2025 15:37:08 -0600
Subject: [PATCH 410/571] doc - fix duplicate formula warning

---
 examples/ceed/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ceed/index.md b/examples/ceed/index.md
index 785c2cff8f..02b0f41749 100644
--- a/examples/ceed/index.md
+++ b/examples/ceed/index.md
@@ -79,7 +79,7 @@ Using the same notation as in {ref}`theoretical-framework`, we write here the ve
 
 $$
 \sum_e \int_{\Omega_e}\left( v(x) 1 + \nabla v(x) \cdot 0 \right) \, dV
-$$ (volume-sum)
+$$ (volume-sum-mass-diff)
 
 with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, the test functions.
 

From 525f58efad3fdbd516b1ca0b43ee04d6060753fc Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 29 May 2025 09:52:49 -0600
Subject: [PATCH 411/571] doc - use HTML comments as inclusion markers

---
 README.md                           | 8 +++++---
 doc/sphinx/source/gettingstarted.md | 3 ++-
 examples/README.md                  | 8 ++++----
 examples/bps.md                     | 4 ++--
 examples/fluids/README.md           | 2 +-
 examples/fluids/index.md            | 2 +-
 examples/solids/README.md           | 2 +-
 examples/solids/index.md            | 2 +-
 8 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index dcf4898a1c..65f1819aa9 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ The CEED research is supported by the [Exascale Computing Project](https://exasc
 
 For more details on the CEED API see the [user manual](https://libceed.org/en/latest/).
 
-<!-- % gettingstarted-inclusion-marker-->
+<!-- getting-started-inclusion -->
 
 ## Building
 
@@ -243,6 +243,8 @@ Bit-for-bit reproducibility is important in some applications.
 However, some libCEED backends use non-deterministic operations, such as `atomicAdd` for increased performance.
 The backends which are capable of generating reproducible results, with the proper compilation options, are highlighted in the list above.
 
+<!-- getting-started-exclusion -->
+
 ## Examples
 
 libCEED comes with several examples of its usage, ranging from standalone C codes in the `/examples/ceed` directory to examples based on external packages, such as MFEM, PETSc, and Nek5000.
@@ -254,7 +256,7 @@ To build the examples, set the `MFEM_DIR`, `PETSC_DIR` (and optionally `PETSC_AR
 $ cd examples/
 ```
 
-<!-- % running-examples-inclusion-marker-->
+<!-- running-examples-inclusion -->
 
 ```console
 # libCEED examples on CPU and GPU
@@ -337,7 +339,7 @@ The above code assumes a GPU-capable machine with the CUDA backends enabled.
 Depending on the available backends, other CEED resource specifiers can be provided with the `-ceed` option.
 Other command line arguments can be found in [examples/petsc](https://github.com/CEED/libCEED/blob/main/examples/petsc/README.md).
 
-<!-- % benchmarks-marker-->
+<!-- running-examples-exclusion -->
 
 ## Benchmarks
 
diff --git a/doc/sphinx/source/gettingstarted.md b/doc/sphinx/source/gettingstarted.md
index eb75a1e06c..0369bbaa40 100644
--- a/doc/sphinx/source/gettingstarted.md
+++ b/doc/sphinx/source/gettingstarted.md
@@ -1,5 +1,6 @@
 # Getting Started
 
 ```{include} ./README.md
-:start-after: gettingstarted-inclusion-marker-->
+:start-after: <!-- getting-started-inclusion -->
+:end-before: <!-- getting-started-exclusion -->
 ```
diff --git a/examples/README.md b/examples/README.md
index 3103440c1b..d5adadad36 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,7 +9,7 @@ For more details, please see the dedicated [documentation section](https://libce
 
 ## Bakeoff Problems
 
-<!-- % bps-inclusion-marker-->
+<!-- bps-inclusion -->
 
 The Center for Efficient Exascale Discretizations (CEED) uses Bakeoff Problems (BPs) to test and compare the performance of high-order finite element implementations.
 The definitions of the problems are given on the ceed [website](https://ceed.exascaleproject.org/bps/).
@@ -46,7 +46,7 @@ The BPs are parametrized by the number $P$ of Gauss-Legendre-Lobatto nodal point
 A $Q$-point Gauss-Legendre quadrature is used for all BPs except BP5 and BP6, which choose $Q = P$ and Gauss-Legendre-Lobatto quadrature to collocate with the interpolation nodes.
 This latter choice is popular in applications that use spectral element methods because it produces a diagonal mass matrix (enabling easy explicit time integration) and significantly reduces the number of floating point operations to apply the operator.
 
-<!-- % bps-exclusion-marker-->
+<!-- bps-exclusion -->
 
 For a more detailed description of the operators employed in the BPs, please see the dedicated [BPs documentation section](https://libceed.org/en/latest/examples/bps.html).
 
@@ -75,6 +75,6 @@ For a detailed description, please see the corresponding [problems on the cubed-
 To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run
 
 ```{include} ../README.md
-:start-after: running-examples-inclusion-marker-->
-:end-before: benchmarks-marker-->
+:start-after: <!-- running-examples-inclusion -->
+:end-before: <!-- running-examples-exclusion -->
 ```
diff --git a/examples/bps.md b/examples/bps.md
index 47ba00e80e..7014c71f77 100644
--- a/examples/bps.md
+++ b/examples/bps.md
@@ -3,8 +3,8 @@
 # CEED Bakeoff Problems
 
 ```{include} ./README.md
-:start-after: bps-inclusion-marker
-:end-before: bps-exclusion-marker
+:start-after: <!-- bps-inclusion -->
+:end-before: <!-- bps-exclusion -->
 ```
 
 (mass-operator)=
diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index 9687420530..ec0ea560e1 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -23,7 +23,7 @@ and run with:
 
 ## Runtime options
 
-<!-- % inclusion-fluids-marker-->
+<!-- fluids-inclusion -->
 
 The Navier-Stokes mini-app is controlled via command-line options.
 The following options are common among all problem types:
diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index ea341b9ff5..27fe5b9a3c 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -9,7 +9,7 @@ Moreover, the Navier-Stokes example has been developed using PETSc, so that the
 ## Running the mini-app
 
 ```{include} README.md
-:start-after: inclusion-fluids-marker-->
+:start-after: <!-- fluids-inclusion -->
 ```
 ## The Navier-Stokes equations
 
diff --git a/examples/solids/README.md b/examples/solids/README.md
index e10c7ef5be..31b4651c3f 100644
--- a/examples/solids/README.md
+++ b/examples/solids/README.md
@@ -22,7 +22,7 @@ and run with:
 
 ## Runtime options
 
-<!-- % inclusion-solids-marker-->
+<!-- solids-inclusion -->
 
 The elasticity mini-app is controlled via command-line options, the following of which are mandatory.
 
diff --git a/examples/solids/index.md b/examples/solids/index.md
index faceae2094..6d164bd9df 100644
--- a/examples/solids/index.md
+++ b/examples/solids/index.md
@@ -34,7 +34,7 @@ $$ (hyperelastic-cd)
 ## Running the mini-app
 
 ```{include} README.md
-:start-after: inclusion-solids-marker-->
+:start-after: <!-- solids-inclusion -->
 ```
 
 (problem-linear-elasticity)=

From cf415216851e455690c5797385b9bba87c798dc1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 30 May 2025 12:29:31 -0600
Subject: [PATCH 412/571] cpu - add ncomp=4 as common restriction variant

---
 backends/ref/ceed-ref-restriction.c | 36 +++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 3b5a5d50e4..1a9c58e3ac 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -553,6 +553,30 @@ static int CeedElemRestrictionApply_Ref_381(CeedElemRestriction rstr, const Ceed
 }
 
 // LCOV_EXCL_START
+static int CeedElemRestrictionApply_Ref_410(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 1, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
+static int CeedElemRestrictionApply_Ref_411(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 1, 1, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
+static int CeedElemRestrictionApply_Ref_480(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 8, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
+static int CeedElemRestrictionApply_Ref_481(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 8, 1, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
 static int CeedElemRestrictionApply_Ref_510(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
                                             CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
                                             CeedVector v, CeedRequest *request) {
@@ -836,6 +860,18 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode,
       impl->Apply = CeedElemRestrictionApply_Ref_381;
       break;
     // LCOV_EXCL_START
+    case 410:
+      impl->Apply = CeedElemRestrictionApply_Ref_410;
+      break;
+    case 411:
+      impl->Apply = CeedElemRestrictionApply_Ref_411;
+      break;
+    case 480:
+      impl->Apply = CeedElemRestrictionApply_Ref_480;
+      break;
+    case 481:
+      impl->Apply = CeedElemRestrictionApply_Ref_481;
+      break;
     case 510:
       impl->Apply = CeedElemRestrictionApply_Ref_510;
       break;

From 4baa7aec0dd4cb16a4e44df395cefec334617a2e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 30 May 2025 12:23:49 -0600
Subject: [PATCH 413/571] cpu - remove vectorize warnings for Clang -O3

---
 backends/memcheck/ceed-memcheck-restriction.c | 20 ++--
 backends/ref/ceed-ref-restriction.c           | 95 ++++++++++---------
 2 files changed, 58 insertions(+), 57 deletions(-)

diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index 47200bf004..5bae298ddb 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -57,8 +57,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(CeedE
 
   // Apply restriction
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-      CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
+      for (CeedSize n = 0; n < elem_size; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]];
@@ -78,7 +78,7 @@ static inline int CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(CeedEl
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
       }
@@ -96,7 +96,7 @@ static inline int CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(Ceed
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] =
             uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
@@ -115,7 +115,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -125,7 +125,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -156,7 +156,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memche
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -166,7 +166,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memche
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -203,8 +203,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(CeedEle
 
   // Apply restriction
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-      CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
+      for (CeedSize n = 0; n < elem_size; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
           vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] +=
               uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 1a9c58e3ac..ce0ad60c75 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -17,8 +17,8 @@
 // Core ElemRestriction Apply Code
 //------------------------------------------------------------------------------
 static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                      CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
-                                                                      CeedSize v_offset, const CeedScalar *__restrict__ uu,
+                                                                      const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                      const CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
                                                                       CeedScalar *__restrict__ vv) {
   // No offsets provided, identity restriction
   bool has_backend_strides;
@@ -28,8 +28,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe
     // CPU backend strides are {1, elem_size, elem_size*num_comp}
     // This if branch is left separate to allow better inlining
     for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
             vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
                 uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * (CeedSize)num_comp];
@@ -43,8 +43,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe
 
     CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
     for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
             vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
                 uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]];
@@ -57,15 +57,15 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe
 }
 
 static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                     const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                     CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                     CeedScalar *__restrict__ vv) {
+                                                                     const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                     const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                     const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Default restriction with offsets
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
       }
@@ -75,15 +75,15 @@ static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRes
 }
 
 static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                       const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                       CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                       CeedScalar *__restrict__ vv) {
+                                                                       const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                       const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                       const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with orientations
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] =
             uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
@@ -94,15 +94,15 @@ static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemR
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                           const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                           CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                           CeedScalar *__restrict__ vv) {
+                                                                           const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                           const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                           const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with tridiagonal transformation
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -112,7 +112,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedE
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -136,16 +136,16 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedE
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp,
-                                                                                   const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
-                                                                                   CeedInt stop, CeedInt num_elem, CeedInt elem_size,
-                                                                                   CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                                   CeedScalar *__restrict__ vv) {
+                                                                                   const CeedInt block_size, const CeedInt comp_stride,
+                                                                                   const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                                   const CeedInt elem_size, const CeedSize v_offset,
+                                                                                   const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with (unsigned) tridiagonal transformation
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -155,7 +155,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Co
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -179,9 +179,9 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Co
 }
 
 static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                    CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
-                                                                    CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                    CeedScalar *__restrict__ vv) {
+                                                                    const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                    const CeedInt elem_size, const CeedSize v_offset,
+                                                                    const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // No offsets provided, identity restriction
   bool has_backend_strides;
 
@@ -190,8 +190,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest
     // CPU backend strides are {1, elem_size, elem_size*num_comp}
     // This if brach is left separate to allow better inlining
     for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
             vv[n + k * elem_size + (e + j) * elem_size * num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
           }
@@ -204,8 +204,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest
 
     CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
     for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
             vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] +=
                 uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
@@ -218,9 +218,9 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest
 }
 
 static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                   const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                   CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                   CeedScalar *__restrict__ vv) {
+                                                                   const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                   const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                   const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Default restriction with offsets
   CeedElemRestriction_Ref *impl;
 
@@ -242,9 +242,9 @@ static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestr
 }
 
 static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                     const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                     CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                     CeedScalar *__restrict__ vv) {
+                                                                     const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                     const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                     const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with orientations
   CeedElemRestriction_Ref *impl;
 
@@ -266,9 +266,9 @@ static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRes
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                         const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                         CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                         CeedScalar *__restrict__ vv) {
+                                                                         const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                         const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                         const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with tridiagonal transformation
   CeedElemRestriction_Ref *impl;
   CeedScalar               vv_loc[block_size];
@@ -317,8 +317,9 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp,
-                                                                                 const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
-                                                                                 CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
+                                                                                 const CeedInt block_size, const CeedInt comp_stride,
+                                                                                 const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                                 const CeedInt elem_size, const CeedSize v_offset,
                                                                                  const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with (unsigned) tridiagonal transformation
   CeedElemRestriction_Ref *impl;
@@ -367,8 +368,8 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core
   return CEED_ERROR_SUCCESS;
 }
 
-static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start, CeedInt stop,
-                                                                     CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,
+static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt start,
+                                                                     const CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,
                                                                      CeedScalar *__restrict__ vv) {
   CeedInt                  num_points, l_vec_offset;
   CeedSize                 e_vec_offset = 0;
@@ -393,8 +394,8 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes
 }
 
 static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                    const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs,
-                                                    bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {
+                                                    const CeedInt comp_stride, const CeedInt start, const CeedInt stop, CeedTransposeMode t_mode,
+                                                    bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {
   CeedInt             num_elem, elem_size;
   CeedSize            v_offset = 0;
   CeedRestrictionType rstr_type;

From bec2b2889fc07db077dc30cb0ca6a3f36864e4ae Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 2 Jun 2025 10:05:30 -0600
Subject: [PATCH 414/571] ex - update for style

Co-authored-by: Peter Munch <peterrmuench@gmail.com>
---
 examples/deal.II/bps.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 654c1b4358..3ff1c8cd0d 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -204,12 +204,10 @@ class OperatorCeed : public OperatorBase<Number>
     const unsigned int fe_degree  = fe.tensor_degree();
     const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
     {
-      dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info;
-      shape_info.reinit(quadrature, fe, 0);
-      dealii::internal::MatrixFreeFunctions::UnivariateShapeData<double> shape_data =
-        shape_info.get_shape_data();
+      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature, fe, 0);
+      const auto             &shape_data = shape_info.get_shape_data();
       std::vector<CeedScalar> q_ref_1d;
-      for (const auto q : quadrature.get_tensor_basis()[0].get_points())
+      for (const auto q : shape_data.quadrature.get_points())
         q_ref_1d.push_back(q(0));
 
       CeedBasisCreateTensorH1(ceed,
@@ -595,12 +593,12 @@ class OperatorCeed : public OperatorBase<Number>
     FE_Q<dim> geo_fe(fe_degree);
 
     {
-      dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info;
-      shape_info.reinit(quadrature, geo_fe, 0);
-      dealii::internal::MatrixFreeFunctions::UnivariateShapeData<double> shape_data =
-        shape_info.get_shape_data();
+      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature,
+                                                                                geo_fe,
+                                                                                0);
+      const auto             &shape_data = shape_info.get_shape_data();
       std::vector<CeedScalar> q_ref_1d;
-      for (const auto q : quadrature.get_tensor_basis()[0].get_points())
+      for (const auto q : shape_data.quadrature.get_points())
         q_ref_1d.push_back(q(0));
 
       CeedBasisCreateTensorH1(ceed,

From 6561c4d27500f07007fbe687c5f7106b89f55e19 Mon Sep 17 00:00:00 2001
From: Leila Ghaffari <Leila.Ghaffari@colorado.edu>
Date: Thu, 12 Jun 2025 18:58:32 -0600
Subject: [PATCH 415/571] Layla name change

---
 AUTHORS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/AUTHORS b/AUTHORS
index c41f42dfb9..adc091e7a1 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,7 +6,7 @@ Jed Brown
 Jean-Sylvain Camier
 Veselin Dobrev
 Yohann Dudouit
-Leila Ghaffari
+Layla Ghaffari
 Sebastian Grimberg
 Tzanio Kolev
 David Medina

From d538d163358b73723887a2d3949507319f119601 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 16 Jun 2025 10:29:41 -0600
Subject: [PATCH 416/571] doc - improve internal API documentation (#1834)

* doc - improve internal API documentation

* doc - clarify delegation

* doc - update wording for clarity

Co-authored-by: James Wright <james@jameswright.xyz>

* doc - clarify memcheck==valgrind

* doc - clarify shared gpu backends

* doc - clarify l/e/q-vecs

* doc - clarify occa backend status

* doc - more inheritance clarification

* doc - minor claification to QF fields

* minor - remove unneeded variables

* doc - update inheritance language

* doc - update shared description

* doc - more gpu notes

* doc - update fallback description

* doc - clarify fallback is only for pc support

* doc - minor updates to user facing GPU section

* doc - minor reordering of major sections

---------

Co-authored-by: James Wright <james@jameswright.xyz>
---
 backends/cuda-gen/ceed-cuda-gen.c        |   3 +-
 backends/hip-gen/ceed-hip-gen.c          |   9 +-
 backends/sycl-gen/ceed-sycl-gen.sycl.cpp |   3 +-
 doc/sphinx/source/gpu.md                 |  13 +-
 doc/sphinx/source/index.md               |   2 +-
 doc/sphinx/source/libCEEDdev.md          | 258 ++++++++++++++---------
 interface/ceed-qfunction.c               |  18 +-
 7 files changed, 194 insertions(+), 112 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index 89ef059c1a..f38b700225 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -18,7 +18,6 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
   char      *resource_root;
-  const char fallback_resource[] = "/gpu/cuda/ref";
   Ceed       ceed_shared;
   Ceed_Cuda *data;
 
@@ -35,7 +34,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
   CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
+  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, "/gpu/cuda/ref"));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen));
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index ab0a566630..9871863507 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -17,10 +17,9 @@
 // Backend init
 //------------------------------------------------------------------------------
 static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
-  char      *resource_root;
-  const char fallback_resource[] = "/gpu/hip/ref";
-  Ceed       ceed_shared;
-  Ceed_Hip  *data;
+  char     *resource_root;
+  Ceed      ceed_shared;
+  Ceed_Hip *data;
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
   CeedCheck(!strcmp(resource_root, "/gpu/hip") || !strcmp(resource_root, "/gpu/hip/gen"), ceed, CEED_ERROR_BACKEND,
@@ -35,7 +34,7 @@ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
   CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
+  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, "/gpu/hip/ref"));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen));
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index 6335df787a..29abd0acb2 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -21,7 +21,6 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   Ceed       ceed_shared;
   Ceed_Sycl *data;
   char      *resource_root;
-  const char fallback_resource[] = "/gpu/sycl/ref";
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root));
   CeedCheck(!strcmp(resource_root, "/gpu/sycl") || !strcmp(resource_root, "/gpu/sycl/gen"), ceed, CEED_ERROR_BACKEND,
@@ -37,7 +36,7 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetStream_Sycl(ceed_shared, &(data->sycl_queue)));
   CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
+  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, "/gpu/sycl/ref"));
 
   Ceed ceed_fallback = NULL;
   CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
diff --git a/doc/sphinx/source/gpu.md b/doc/sphinx/source/gpu.md
index f7418ac5f3..6040e52a8d 100644
--- a/doc/sphinx/source/gpu.md
+++ b/doc/sphinx/source/gpu.md
@@ -7,11 +7,12 @@ Code that produces correct results with CPU backends will produce correct result
 
 The filepath to the user source code is passed in {c:func}`CeedQFunctionCreateInterior` as the `source` argument.
 This filepath should typically be an absolute path to ensure the JiT compilation can locate the source file.
-The filepath may also be relative to a root directory set with {c:func}`CeedAddJitSourceRoot`.
-The {c:macro}`CEED_QFUNCTION` macro automatically creates a string with the absolute path stored in the variable `user_loc` for a {c:type}`CeedQFunctionUser` called `user`.
+The filepath may also be a relative path with respect to a root directory set with {c:func}`CeedAddJitSourceRoot`.
+The {c:macro}`CEED_QFUNCTION` macro automatically creates a string with the absolute path, for example a {c:type}`CeedQFunctionUser` called `user` would have this string stored in the variable `user_loc`.
 
-The entire contents of this file and all locally included files (`#include "foo.h"`) are used during JiT compilation for GPU backends.
-Installed headers (`#include <bar.h>`) are omitted in the source code passed to JiT, but the compilation environment may supply common headers such as `<math.h>`.
+The entire contents of this source file and all included files are used during JiT compilation for GPU backends.
+Include statements for system headers that are required for CPU compilation but are not available in GPU compilation environments should be guarded with `#ifdef CEED_RUNNING_JIT_PASS`.
+Any function definitions in these system headers must still be available in the GPU compilation environments, such as the contents of `<math.h>`.
 These source file must only contain syntax constructs supported by C99 and all targeted backends (i.e. CUDA for `/gpu/cuda`, OpenCL/SYCL for `/gpu/sycl`, etc.).
 
 All source files must be at the provided filepath at runtime for JiT to function.
@@ -20,10 +21,10 @@ All source files must be at the provided filepath at runtime for JiT to function
 
 GPU backends require stricter adherence to memory access assumptions, but CPU backends may occasionally report correct results despite violations of memory access assumptions.
 Both `CeedVector` and `CeedQFunctionContext` have read-only and read-write accessors, and `CeedVector` allow write-only access.
-Read-only access of `CeedVector` and `CeedQFunctionContext` memory spaces must be respected for proper GPU behavior.
+Read-only access of `CeedVector` and `CeedQFunctionContext` memory spaces must be respected for to ensure proper GPU behavior.
 Write-only access of `CeedVector` memory spaces asserts that all data in the `CeedVector` is invalid until overwritten.
 
-`CeedQFunction` assume that all input arrays are read-only and all output arrays are write-only and the {c:type}`CeedQFunctionUser` must adhere to these assumptions, only reading data in the input arrays and fully overwriting the output arrays.
+`CeedQFunction` assume that all input arrays are read-only and all output arrays are write-only and the {c:type}`CeedQFunctionUser` must adhere to these assumptions, only reading data in the input arrays and fully overwriting all entries in the output arrays.
 Additionally, {c:type}`CeedQFunctionUser` have read-write access for `CeedQFunctionContext` data, unless {c:func}`CeedQFunctionSetContextWritable` was used to indicate that read-only access is sufficient.
 
 The `/cpu/self/memcheck` backends explicitly verify read-only and write-only memory access assumptions.
diff --git a/doc/sphinx/source/index.md b/doc/sphinx/source/index.md
index 82272c54d4..f35c1bb5a0 100644
--- a/doc/sphinx/source/index.md
+++ b/doc/sphinx/source/index.md
@@ -8,9 +8,9 @@ intro
 gettingstarted
 libCEEDapi
 examples/index
+api/index
 ffi
 gpu
-api/index
 precision
 libCEEDdev
 Contributing <CONTRIBUTING>
diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md
index 7a009ee811..536588fe06 100644
--- a/doc/sphinx/source/libCEEDdev.md
+++ b/doc/sphinx/source/libCEEDdev.md
@@ -1,98 +1,119 @@
 # Developer Notes
 
-## Style Guide
+## Library Design
 
-Please check your code for style issues by running
+LibCEED has a single user facing API for creating and using the libCEED objects ({ref}`CeedVector`, {ref}`CeedBasis`, etc).
+Different Ceed backends are selected by instantiating a different {ref}`Ceed` object to create the other libCEED objects, in a [bridge pattern](https://en.wikipedia.org/wiki/Bridge_pattern).
+At runtime, the user can select the different backend implementations to target different hardware, such as CPUs or GPUs.
 
-`make format`
+When designing new features, developers should place the function definitions for the user facing API in the header `/include/ceed/ceed.h`.
+The basic implementation of these functions should typically be placed in `/interface/*.c` files.
+The interface should pass any computationally expensive or hardware specific operations to a backend implementation.
+A new method for the associated libCEED object can be added in `/include/ceed-impl.h`, with a corresponding `CEED_FTABLE_ENTRY` in `/interface/ceed.c` to allow backends to set their own implementations of this method.
+Then in the creation of the backend specific implementation of the object, typically found in `/backends/[impl]/ceed-[impl]-[object].c`, the developer creates the backend implementation of the specific method and calls {c:func}`CeedSetBackendFunction` to set this implementation of the method for the backend.
+Any supplemental functions intended to be used in the interface or by the backends may be added to the backend API in the header `/include/ceed/backend.h`.
+The basic implementation of these functions should also be placed in `/interface/*.c` files.
 
-In addition to those automatically enforced style rules, libCEED tends to follow the following code style conventions:
+LibCEED generally follows a "CPU first" implementation strategy when adding new functionality to the user facing API.
+If there are no performance specific considerations, it is generally recommended to include a basic CPU default implementation in `/interface/*.c`.
+Any new functions must be well documented and tested.
+Once the user facing API and the default implementation are in place and verified correct via tests, then the developer can focus on hardware specific implementations (AVX, CUDA, HIP, etc.) as necessary.
 
-- Variable names: `snake_case`
-- Strut members: `snake_case`
-- Function and method names: `PascalCase` or language specific style
-- Type names: `PascalCase` or language specific style
-- Constant names: `CAPS_SNAKE_CASE` or language specific style
+## Backend Inheritance
 
-Also, documentation files should have one sentence per line to help make git diffs clearer and less disruptive.
+A Ceed backend is not required to implement all libCeed objects or {ref}`CeedOperator` methods.
+There are three mechanisms by which a Ceed backend can inherit implementations from another Ceed backend.
 
-## Clang-tidy
+1. Delegation - Developers may use {c:func}`CeedSetDelegate` to set a general delegate {ref}`Ceed` object.
+   This delegate {ref}`Ceed` will provide the implementation of any libCeed objects that parent backend does not implement.
+   For example, the `/cpu/self/xsmm/serial` backend implements the `CeedTensorContract` object itself but delegates all other functionality to the `/cpu/self/opt/serial` backend.
 
-Please check your code for common issues by running
+2. Object delegation  - Developers may use {c:func}`CeedSetObjectDelegate` to set a delegate {ref}`Ceed` object for a specific libCEED object.
+   This delegate {ref}`Ceed` will only provide the implementation of that specific libCeed object for the parent backend.
+   Object delegation has higher precedence than delegation.
 
-`make tidy`
+3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackResource` to set a string identifying which {ref}`Ceed` backend will be instantiated to provide any unimplemented {ref}`CeedOperator` methods that support preconditioning, such as {c:func}`CeedOperatorLinearAssemble`.
+   The parent backend must implement the basic {ref}`CeedOperator` functionality.
+   This fallback {ref}`Ceed` object will only be created if a method is called that is not implemented by the parent backend.
+   In order to use operator fallback, the parent backend and fallback backend must use compatible E-vector and Q-vector layouts.
+   For example, `/gpu/cuda/gen` falls back to `/gpu/cuda/ref` for missing {ref}`CeedOperator` preconditioning support methods.
+   If an unimplemented method is called, then the parent `/gpu/cuda/gen` {ref}`Ceed` object creates a fallback `/gpu/cuda/ref` {ref}`Ceed` object and creates a clone of the {ref}`CeedOperator` with this fallback {ref}`Ceed` object.
+   This clone {ref}`CeedOperator` is then used for the unimplemented preconditioning support methods.
 
-which uses the `clang-tidy` utility included in recent releases of Clang.
-This tool is much slower than actual compilation (`make -j8` parallelism helps).
-To run on a single file, use
+## Backend Families
 
-`make interface/ceed.c.tidy`
+There are 4 general 'families' of backend implementations.
+As internal data layouts are specific to backend families, it is generally not possible to delegate between backend families.
 
-for example.
-All issues reported by `make tidy` should be fixed.
+### CPU Backends
 
-## Include-What-You-Use
+The basic CPU with the simplest implementation is `/cpu/self/ref/serial`.
+This backend contains the basic implementations of most objects that other backends rely upon.
+Most of the other CPU backends only update the {ref}`CeedOperator` and `CeedTensorContract` objects.
 
-Header inclusion for source files should follow the principal of 'include what you use' rather than relying upon transitive `#include` to define all symbols.
+The `/cpu/self/ref/blockend` and `/cpu/self/opt/*` backends delegate to the `/cpu/self/ref/serial` backend.
+The `/cpu/self/ref/blocked` backend updates the {ref}`CeedOperator` to use an E-vector and Q-vector ordering when data for 8 elements are interlaced to provide better vectorization.
+The `/cpu/self/opt/*` backends update the {ref}`CeedOperator` to apply the action of the operator in 1 or 8 element batches, depending upon if the blocking strategy is used.
+This reduced the memory required to utilize this backend significantly.
 
-Every symbol that is used in the source file `foo.c` should be defined in `foo.c`, `foo.h`, or in a header file `#include`d in one of these two locations.
-Please check your code by running the tool [`include-what-you-use`](https://include-what-you-use.org/) to see recommendations for changes to your source.
-Most issues reported by `include-what-you-use` should be fixed; however this rule is flexible to account for differences in header file organization in external libraries.
-If you have `include-what-you-use` installed in a sibling directory to libCEED or set the environment variable `IWYU_CC`, then you can use the makefile target `make iwyu`.
+The `/cpu/self/avx/*` and `/cpu/self/xsmm/*` backends delegate to the corresponding `/cpu/self/opt/*` backends.
+These backends update the `CeedTensorContract` objects using AVX intrinsics and libXSMM functions, respectively.
 
-Header files should be listed in alphabetical order, with installed headers preceding local headers and `ceed` headers being listed first.
-The `ceed-f64.h` and `ceed-f32.h` headers should only be included in `ceed.h`.
+The `/cpu/self/memcheck/*` backends delegate to the `/cpu/self/ref/*` backends.
+These backends replace many of the implementations with methods that include more verification checks and a memory management model that more closely matches the memory management for GPU backends.
+These backends rely upon the [Valgrind](https://valgrind.org/) Memcheck tool and Valgrind headers.
 
-```c
-#include <ceed.h>
-#include <ceed/backend.h>
-#include <stdbool.h>
-#include <string.h>
-#include "ceed-avx.h"
-```
+### GPU Backends
 
-## Shape
+The CUDA, HIP, and SYCL backend families all follow similar designs.
+The CUDA and HIP backends are very similar, with minor differences.
+While the SYCL backend was based upon the CUDA and HIP backends, there are more internal differences to accommodate OpenCL and Intel hardware.
 
-Backends often manipulate tensors of dimension greater than 2.
-It is awkward to pass fully-specified multi-dimensional arrays using C99 and certain operations will flatten/reshape the tensors for computational convenience.
-We frequently use comments to document shapes using a lexicographic ordering.
-For example, the comment
+The `/gpu/*/ref` backends provide basic functionality.
+In these backends, the operator is applied in multiple separate kernel launches, following the libCEED operator decomposition, where first {ref}`CeedElemRestriction` kernels map from the L-vectors to E-vectors, then {ref}`CeedBasis` kernels map from the E-vectors to Q-vectors, then the {ref}`CeedQFunction` kernel provides the action of the user quadrature point function, and the transpose {ref}`CeedBasis` and {ref}`CeedElemRestriction` kernels are applied to go back to the E-vectors and finally the L-vectors.
+These kernels apply to all points across all elements in order to maximize the amount of work each kernel launch has.
+Some of these kernels are compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC.
 
-```c
-// u has shape [dim, num_comp, Q, num_elem]
-```
+The `/gpu/*/shared` backends delegate to the corresponding `/gpu/*/ref` backends.
+These backends use shared memory to improve performance for the {ref}`CeedBasis` kernels.
+All other libCEED objects are delegated to `/gpu/*/ref`.
+These kernels are compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC.
 
-means that it can be traversed as
+The `/gpu/*/gen` backends delegate to the corresponding `/gpu/*/shared` backends.
+These backends write a single comprehensive kernel to apply the action of the {ref}`CeedOperator`, significantly improving performance by eliminating intermediate data structures and reducing the total number of kernel launches required.
+This kernel is compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC.
 
-```c
-for (d=0; d<dim; d++)
-  for (c=0; c<num_comp; c++)
-    for (q=0; q<Q; q++)
-      for (e=0; e<num_elem; e++)
-        u[((d*num_comp + c)*Q + q)*num_elem + e] = ...
-```
+The `/gpu/*/magma` backends delegate to the corresponding `/gpu/cuda/ref` and `/gpu/hip/ref` backends.
+These backends provide better performance for {ref}`CeedBasis` kernels but do not have the improvements from the `/gpu/*/gen` backends for {ref}`CeedOperator`.
 
-This ordering is sometimes referred to as row-major or C-style.
-Note that flattening such as
+The `/*/*/occa` backends are an experimental feature and not part of any family.
 
-```c
-// u has shape [dim, num_comp, Q*num_elem]
-```
+## Internal Layouts
 
-and
+Ceed backends are free to use any E-vector and Q-vector data layout (including never fully forming these vectors) so long as the backend passes the `t5**` series tests and all examples.
+There are several common layouts for L-vectors, E-vectors, and Q-vectors, detailed below:
 
-```c
-// u has shape [dim*num_comp, Q, num_elem]
-```
+- **L-vector** layouts
 
-are purely implicit -- one just indexes the same array using the appropriate convention.
+  - L-vectors described by a standard {ref}`CeedElemRestriction` have a layout described by the `offsets` array and `comp_stride` parameter.
+    Data for node `i`, component `j`, element `k` can be found in the L-vector at index `offsets[i + k*elem_size] + j*comp_stride`.
+  - L-vectors described by a strided {ref}`CeedElemRestriction` have a layout described by the `strides` array.
+    Data for node `i`, component `j`, element `k` can be found in the L-vector at index `i*strides[0] + j*strides[1] + k*strides[2]`.
 
-## `restrict` Semantics
+- **E-vector** layouts
 
-QFunction arguments can be assumed to have `restrict` semantics.
-That is, each input and output array must reside in distinct memory without overlap.
+  - If possible, backends should use {c:func}`CeedElemRestrictionSetELayout()` to use the `t2**` tests.
+    If the backend uses a strided E-vector layout, then the data for node `i`, component `j`, element `k` in the E-vector is given by `i*layout[0] + j*layout[1] + k*layout[2]`.
+  - Backends may choose to use a non-strided E-vector layout; however, the `t2**` tests will not function correctly in this case and these tests will need to be marked as allowable failures for this backend in the test suite.
 
-## CeedVector Array Access Semantics
+- **Q-vector** layouts
+
+  - When the size of a {ref}`CeedQFunction` field is greater than `1`, data for quadrature point `i` component `j` can be found in the Q-vector at index `i + Q*j`, where `Q` is the total number of quadrature points in the Q-vector.
+    Backends are free to provide the quadrature points in any order.
+  - When the {ref}`CeedQFunction` field has `emode` `CEED_EVAL_GRAD`, data for quadrature point `i`, component `j`, derivative `k` can be found in the Q-vector at index `i + Q*j + Q*num_comp*k`.
+  - Backend developers must take special care to ensure that the data in the Q-vectors for a field with `emode` `CEED_EVAL_NONE` is properly ordered when the backend uses different layouts for E-vectors and Q-vectors.
+
+## CeedVector Array Access
 
 Backend implementations are expected to separately track 'owned' and 'borrowed' memory locations.
 Backends are responsible for freeing 'owned' memory; 'borrowed' memory is set by the user and backends only have read/write access to 'borrowed' memory.
@@ -127,45 +148,94 @@ All backends may assume that array access will conform to these guidelines:
     Data synchronization is not required for the memory location returned by {c:func}`CeedVectorGetArrayWrite`.
     The caller should assume that all data at the memory location returned by {c:func}`CeedVectorGetArrayWrite` is *invalid*.
 
-## Internal Layouts
+## Shape
 
-Ceed backends are free to use any **E-vector** and **Q-vector** data layout, to include never fully forming these vectors, so long as the backend passes the `t5**` series tests and all examples.
-There are several common layouts for **L-vectors**, **E-vectors**, and **Q-vectors**, detailed below:
+Backends often manipulate tensors of dimension greater than 2.
+It is awkward to pass fully-specified multi-dimensional arrays using C99 and certain operations will flatten/reshape the tensors for computational convenience.
+We frequently use comments to document shapes using a lexicographic ordering.
+For example, the comment
 
-- **L-vector** layouts
+```c
+// u has shape [dim, num_comp, Q, num_elem]
+```
 
-  - **L-vectors** described by a {ref}`CeedElemRestriction` have a layout described by the `offsets` array and `comp_stride` parameter.
-    Data for node `i`, component `j`, element `k` can be found in the **L-vector** at index `offsets[i + k*elem_size] + j*comp_stride`.
-  - **L-vectors** described by a strided {ref}`CeedElemRestriction` have a layout described by the `strides` array.
-    Data for node `i`, component `j`, element `k` can be found in the **L-vector** at index `i*strides[0] + j*strides[1] + k*strides[2]`.
+means that it can be traversed as
 
-- **E-vector** layouts
+```c
+for (d = 0; d < dim; d++) {
+  for (c = 0; c < num_comp; c++) {
+    for (q = 0; q < Q; q++) {
+      for (e = 0; e < num_elem; e++) {
+        u[((d*num_comp + c)*Q + q)*num_elem + e] = ...
+```
 
-  - If possible, backends should use {c:func}`CeedElemRestrictionSetELayout()` to use the `t2**` tests.
-    If the backend uses a strided **E-vector** layout, then the data for node `i`, component `j`, element `k` in the **E-vector** is given by `i*layout[0] + j*layout[1] + k*layout[2]`.
-  - Backends may choose to use a non-strided **E-vector** layout; however, the `t2**` tests will not function correctly in this case and the tests will need to be whitelisted for the backend to pass the test suite.
+This ordering is sometimes referred to as row-major or C-style.
+Note that flattening such as
 
-- **Q-vector** layouts
+```c
+// u has shape [dim, num_comp, Q*num_elem]
+```
 
-  - When the size of a {ref}`CeedQFunction` field is greater than `1`, data for quadrature point `i` component `j` can be found in the **Q-vector** at index `i + Q*j`.
-    Backends are free to provide the quadrature points in any order.
-  - When the {ref}`CeedQFunction` field has `emode` `CEED_EVAL_GRAD`, data for quadrature point `i`, component `j`, derivative `k` can be found in the **Q-vector** at index `i + Q*j + Q*size*k`.
-  - Note that backend developers must take special care to ensure that the data in the **Q-vectors** for a field with `emode` `CEED_EVAL_NONE` is properly ordered when the backend uses different layouts for **E-vectors** and **Q-vectors**.
+and
 
-## Backend Inheritance
+```c
+// u has shape [dim*num_comp, Q, num_elem]
+```
 
-There are three mechanisms by which a Ceed backend can inherit implementation from another Ceed backend.
-These options are set in the backend initialization routine.
+are purely implicit -- one just indexes the same array using the appropriate convention.
 
-1. Delegation - Developers may use {c:func}`CeedSetDelegate()` to set a backend that will provide the implementation of any unimplemented Ceed objects.
-2. Object delegation  - Developers may use {c:func}`CeedSetObjectDelegate()` to set a backend that will provide the implementation of a specific unimplemented Ceed object.
-   Object delegation has higher precedence than delegation.
-3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackResource()` to set a {ref}`Ceed` resource that will provide the implementation of unimplemented {ref}`CeedOperator` methods.
-   A fallback {ref}`Ceed` with this resource will only be instantiated if a method is called that is not implemented by the parent {ref}`Ceed`.
-   In order to use the fallback mechanism, the parent {ref}`Ceed` and fallback resource must use compatible **E-vector** and **Q-vector** layouts.
+## `restrict` Semantics
+
+QFunction arguments can be assumed to have `restrict` semantics.
+That is, each input and output array must reside in distinct memory without overlap.
+
+## Style Guide
+
+Please check your code for style issues by running
+
+`make format`
+
+In addition to those automatically enforced style rules, libCEED tends to follow the following code style conventions:
+
+- Variable names: `snake_case`
+- Strut members: `snake_case`
+- Function and method names: `PascalCase` or language specific style
+- Type names: `PascalCase` or language specific style
+- Constant names: `CAPS_SNAKE_CASE` or language specific style
+
+Also, documentation files should have one sentence per line to help make git diffs clearer and less disruptive.
+
+## Clang-tidy
+
+Please check your code for common issues by running
+
+`make tidy`
+
+which uses the `clang-tidy` utility included in recent releases of Clang.
+This tool is much slower than actual compilation (`make -j8` parallelism helps).
+To run on a single file, use
+
+`make interface/ceed.c.tidy`
+
+for example.
+All issues reported by `make tidy` should be fixed.
+
+## Include-What-You-Use
+
+Header inclusion for source files should follow the principal of 'include what you use' rather than relying upon transitive `#include` to define all symbols.
 
-For example, the `/cpu/self/xsmm/serial/` backend implements the `CeedTensorContract` object but delegates all other functionality to the `/cpu/self/opt/serial` backend.
-The `/cpu/self/opt/serial` backend implements the `CeedTensorContract` and `CeedOperator` objects but delegates all other functionality to the `/cpu/self/ref/serial` backend.
+Every symbol that is used in the source file `foo.c` should be defined in `foo.c`, `foo.h`, or in a header file `#include`d in one of these two locations.
+Please check your code by running the tool [`include-what-you-use`](https://include-what-you-use.org/) to see recommendations for changes to your source.
+Most issues reported by `include-what-you-use` should be fixed; however this rule is flexible to account for differences in header file organization in external libraries.
+If you have `include-what-you-use` installed in a sibling directory to libCEED or set the environment variable `IWYU_CC`, then you can use the makefile target `make iwyu`.
 
-If the `/cpu/self/opt/serial` backend had missing {ref}`CeedOperator` functionality, then it could fallback to `/cpu/self/ref/serial` for missing methods.
-In this case, the fallback {ref}`Ceed` would clone the `/cpu/self/opt/serial` {ref}`CeedOperator` and use this clone to execute the missing functionality.
+Header files should be listed in alphabetical order, with installed headers preceding local headers and `ceed` headers being listed first.
+The `ceed-f64.h` and `ceed-f32.h` headers should only be included in `ceed.h`.
+
+```c
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdbool.h>
+#include <string.h>
+#include "ceed-avx.h"
+```
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index cfce6575df..22e90c8b54 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -764,13 +764,20 @@ int CeedQFunctionReferenceCopy(CeedQFunction qf, CeedQFunction *qf_copy) {
 
   @param[in,out] qf         `CeedQFunction`
   @param[in]     field_name Name of `CeedQFunction` field
-  @param[in]     size       Size of `CeedQFunction` field, (`num_comp * 1`) for @ref CEED_EVAL_NONE, (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, (`num_comp * dim`) for @ref CEED_EVAL_GRAD, or (`num_comp * 1`) for @ref CEED_EVAL_DIV, and (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL.
+  @param[in]     size       Size of `CeedQFunction` field,
+                              (`num_comp * 1`) for @ref CEED_EVAL_NONE,
+                              (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space,
+                              (`num_comp * dim`) for @ref CEED_EVAL_GRAD,
+                              (`num_comp * 1`) for @ref CEED_EVAL_DIV, and
+                              (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL.
   @param[in]     eval_mode  @ref CEED_EVAL_NONE to use values directly,
                               @ref CEED_EVAL_INTERP to use interpolated values,
                               @ref CEED_EVAL_GRAD to use gradients,
                               @ref CEED_EVAL_DIV to use divergence,
                               @ref CEED_EVAL_CURL to use curl
 
+  Note: In the user `CeedQFunctionUser`, the `in` argument list the fields in the order given by the calls to `CeedQFunctionAddInput`.
+
   @return An error code: 0 - success, otherwise - failure
 
   @ref User
@@ -799,13 +806,20 @@ int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size
 
   @param[in,out] qf         `CeedQFunction`
   @param[in]     field_name Name of `CeedQFunction` field
-  @param[in]     size       Size of `CeedQFunction` field, (`num_comp * 1`) for @ref CEED_EVAL_NONE, (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, (`num_comp * dim`) for @ref CEED_EVAL_GRAD, or (`num_comp * 1`) for @ref CEED_EVAL_DIV, and (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` else dim for @ref CEED_EVAL_CURL.
+  @param[in]     size       Size of `CeedQFunction` field,
+                              (`num_comp * 1`) for @ref CEED_EVAL_NONE,
+                              (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space,
+                              (`num_comp * dim`) for @ref CEED_EVAL_GRAD,
+                              (`num_comp * 1`) for @ref CEED_EVAL_DIV, and
+                              (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL.
   @param[in]     eval_mode  @ref CEED_EVAL_NONE to use values directly,
                               @ref CEED_EVAL_INTERP to use interpolated values,
                               @ref CEED_EVAL_GRAD to use gradients,
                               @ref CEED_EVAL_DIV to use divergence,
                               @ref CEED_EVAL_CURL to use curl.
 
+  Note: In the user `CeedQFunctionUser`, the `out` argument list the fields in the order given by the calls to `CeedQFunctionAddOutput`.
+
   @return An error code: 0 - success, otherwise - failure
 
   @ref User

From 7ad93db142e270d25656527d7e9abc8dac4a6205 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Mon, 16 Jun 2025 18:11:47 -0600
Subject: [PATCH 417/571] ci: noether CUDA_DIR=/usr/local/cuda-12.9

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b6cb108385..c590f143a7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -196,7 +196,7 @@ noether-cuda:
   script:
     - rm -f .SUCCESS
     # libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr
+    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - echo "-------------- libCEED -------------" && make info
     - BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
@@ -388,7 +388,7 @@ noether-float:
     # Change to single precision
     - sed -i 's/ceed-f64/ceed-f32/1' include/ceed/types.h
     # Build libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr
+    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU

From d4f9124cb1a45ca88b676a102fa1807cec70f276 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Mon, 16 Jun 2025 21:53:44 -0600
Subject: [PATCH 418/571] makefiles: update defaults from c99 to c11

cuda-12.9 contains anonymous unions, which are not in c99. But c11 is
widely supported these days so it seems fair to update.

One nit is that c11 does not require VLA, and libCEED uses VLA-pointers
internally for some purposes.
---
 Makefile                            | 4 ++--
 examples/ceed/Makefile              | 2 +-
 examples/fluids/Makefile            | 2 +-
 examples/petsc/Makefile             | 2 +-
 examples/python/setup_qfunctions.py | 2 +-
 examples/solids/Makefile            | 2 +-
 python/tests/setup-qfunctions.py    | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 88f1911f2a..822e21f83c 100644
--- a/Makefile
+++ b/Makefile
@@ -134,7 +134,7 @@ OPT.clang               := $(OPT.gcc)
 OPT.icc                 := $(OPT.gcc)
 OPT.oneAPI              := $(OPT.clang)
 OPT.emcc                :=
-CFLAGS.gcc              := $(if $(STATIC),,-fPIC) -std=c99 -Wall -Wextra -Wno-unused-parameter -MMD -MP
+CFLAGS.gcc              := $(if $(STATIC),,-fPIC) -std=c11 -Wall -Wextra -Wno-unused-parameter -MMD -MP
 CFLAGS.clang            := $(CFLAGS.gcc)
 CFLAGS.icc              := $(CFLAGS.gcc)
 CFLAGS.oneAPI           := $(CFLAGS.clang)
@@ -958,7 +958,7 @@ vermin    :
 CLANG_TIDY ?= clang-tidy
 
 %.c.tidy : %.c
-	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c99 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\"" -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\""
+	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c11 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\"" -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\""
 
 %.cpp.tidy : %.cpp
 	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(OCCA_DIR)/include -I$(ROCM_DIR)/include
diff --git a/examples/ceed/Makefile b/examples/ceed/Makefile
index 419499f05e..a5dbc70b4a 100644
--- a/examples/ceed/Makefile
+++ b/examples/ceed/Makefile
@@ -9,7 +9,7 @@ OPT ?= -O -g
 
 # Ceed directory
 CEED_DIR ?= ../..
-CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c99 $(OPT)
+CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c11 $(OPT)
 CEED_LIBS ?= -Wl,-rpath,$(abspath $(CEED_DIR)/lib) -L$(CEED_DIR)/lib -lceed -lm
 
 EXAMPLES.c = $(wildcard ex*.c)
diff --git a/examples/fluids/Makefile b/examples/fluids/Makefile
index 1162dbc8bb..c99a63a0b3 100644
--- a/examples/fluids/Makefile
+++ b/examples/fluids/Makefile
@@ -24,7 +24,7 @@ CEED_DIR ?= ../..
 ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc
 
 CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc))
-CFLAGS = -std=c99 \
+CFLAGS = -std=c11 \
   $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \
   $(call pkgconf, --cflags-only-other $(PETSc.pc)) \
   $(OPT) $(OPT_EXAMPLES)
diff --git a/examples/petsc/Makefile b/examples/petsc/Makefile
index 37f08a9dee..ee3bd9313d 100644
--- a/examples/petsc/Makefile
+++ b/examples/petsc/Makefile
@@ -15,7 +15,7 @@ CEED_DIR ?= ../..
 ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc
 
 CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc))
-CFLAGS = -std=c99 \
+CFLAGS = -std=c11 \
   $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \
   $(call pkgconf, --cflags-only-other $(PETSc.pc)) \
   $(OPT)
diff --git a/examples/python/setup_qfunctions.py b/examples/python/setup_qfunctions.py
index c92e6d397e..8c337621e7 100644
--- a/examples/python/setup_qfunctions.py
+++ b/examples/python/setup_qfunctions.py
@@ -17,7 +17,7 @@
 # Compiler arguments
 extra_compile_args = []
 if platform == "linux" or platform == "linux2" or platform == "darwin":
-    extra_compile_args = ["-O3", "-march=native", "-std=c99"]
+    extra_compile_args = ["-O3", "-march=native", "-std=c11"]
 
 # Define the extension module
 qfunctions = Extension("libceed_c_qfunctions",
diff --git a/examples/solids/Makefile b/examples/solids/Makefile
index 54a560f02e..5c87c8f0d0 100644
--- a/examples/solids/Makefile
+++ b/examples/solids/Makefile
@@ -15,7 +15,7 @@ CEED_DIR ?= ../..
 ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc
 
 CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc))
-CFLAGS = -std=c99 \
+CFLAGS = -std=c11 \
   $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \
   $(call pkgconf, --cflags-only-other $(PETSc.pc)) \
   $(OPT)
diff --git a/python/tests/setup-qfunctions.py b/python/tests/setup-qfunctions.py
index 7a09e50e09..3a697113a4 100644
--- a/python/tests/setup-qfunctions.py
+++ b/python/tests/setup-qfunctions.py
@@ -16,7 +16,7 @@
 qf_module = Extension("libceed_qfunctions",
                       include_dirs=[os.path.join(CEED_DIR, 'include')],
                       sources=["libceed-qfunctions.c"],
-                      extra_compile_args=["-O3", "-std=c99",
+                      extra_compile_args=["-O3", "-std=c11",
                                           "-Wno-unused-variable",
                                           "-Wno-unused-function"])
 

From fe4cac902085df27e5190289edece310bcc3c565 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Mon, 16 Jun 2025 22:42:40 -0600
Subject: [PATCH 419/571] tidy:
 -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling

This has warnings about strncpy (vs strncpy_s) and fprintf (vs fprintf_s).
---
 .clang-tidy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.clang-tidy b/.clang-tidy
index ab45c266bf..04cd208737 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,3 +1,3 @@
-Checks: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name,bugprone-too-small-loop-variable"
+Checks: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name,bugprone-too-small-loop-variable,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
 HeaderFilterRegex: .*
 WarningsAsErrors: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name"

From f4239526bb0ce4e797667b3f08cf1478ae4441d4 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Mon, 16 Jun 2025 22:50:18 -0600
Subject: [PATCH 420/571] ci: noether rocm-6.3

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c590f143a7..b9d06a271f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -280,7 +280,7 @@ noether-cuda:
 #  script:
 #    - rm -f .SUCCESS
 #    # libCEED
-#    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
+#    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-O -march=native -ffp-contract=fast'
 #    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
 #    - echo "-------------- libCEED -------------" && make info
 #    - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
@@ -342,7 +342,7 @@ noether-rocm:
   script:
     - rm -f .SUCCESS
     # libCEED
-    - make configure ROCM_DIR=/opt/rocm-6.1.0 OPT='-O -march=native -ffp-contract=fast'
+    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-O -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU

From d6c19ee8504c74d8f30ec67127f069a58291b3ac Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 17 Jun 2025 12:15:58 -0600
Subject: [PATCH 421/571] gpu - clarify __syncthreads usage (#1838)

---
 .../ceed/jit-source/cuda/cuda-gen-templates.h |  6 ++--
 ...-shared-basis-tensor-at-points-templates.h |  5 +++
 .../cuda/cuda-shared-basis-tensor-at-points.h |  4 ---
 ...-shared-basis-tensor-flattened-templates.h | 36 +++++++++----------
 .../cuda/cuda-shared-basis-tensor-templates.h | 26 +++++++-------
 .../ceed/jit-source/hip/hip-gen-templates.h   |  6 ++--
 ...-shared-basis-tensor-at-points-templates.h |  6 ++++
 .../hip/hip-shared-basis-tensor-at-points.h   |  4 ---
 ...-shared-basis-tensor-flattened-templates.h | 36 +++++++++----------
 .../hip/hip-shared-basis-tensor-templates.h   | 26 +++++++-------
 10 files changed, 79 insertions(+), 76 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index f80a2af717..ba4ecfc2bd 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -274,6 +274,7 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
                                         CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      __syncthreads();
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
       __syncthreads();
       // X derivative
@@ -291,7 +292,6 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
       }
-      __syncthreads();
     }
   }
 }
@@ -304,20 +304,20 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const Ce
                                                  CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      __syncthreads();
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
       // X derivative
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
       }
-      __syncthreads();
       // Y derivative
+      __syncthreads();
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
       }
-      __syncthreads();
       // Z derivative
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index a553ddff58..ae22a5f1ba 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -95,6 +95,7 @@ inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, co
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     // Load coefficients
+    __syncthreads();
     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
     __syncthreads();
     // Contract x direction
@@ -145,6 +146,7 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p,
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
+    __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     // Contract x direction
@@ -213,6 +215,7 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
+    __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
@@ -294,6 +297,7 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p,
       CeedScalar chebyshev_x[Q_1D];
 
       // Load coefficients
+      __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       // Contract x direction
@@ -372,6 +376,7 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
       CeedScalar chebyshev_x[Q_1D];
 
       // Load coefficients
+      __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       for (CeedInt dim = 0; dim < 3; dim++) {
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index 0216d0e0ba..dcb1763e38 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -129,7 +129,6 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
         InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -189,7 +188,6 @@ extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, co
         InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -319,7 +317,6 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
         GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -380,7 +377,6 @@ extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, cons
         GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
index e08eb7a20e..d004c341a6 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -19,6 +19,7 @@
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                             CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -27,7 +28,6 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i
       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -36,6 +36,7 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                             CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -44,7 +45,6 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i
       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -53,6 +53,7 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -61,7 +62,6 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons
       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -70,6 +70,7 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -78,7 +79,6 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -87,6 +87,7 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                         const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   if (t_id_x < P_1D && t_id_y < P_1D) {
@@ -94,7 +95,6 @@ inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, c
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -105,10 +105,10 @@ inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const in
   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
     __syncthreads();
     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
@@ -117,10 +117,10 @@ inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const
   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
     __syncthreads();
     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
@@ -218,6 +218,7 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const Ceed
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                             const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -226,7 +227,6 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i
       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -235,6 +235,7 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                             const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -243,7 +244,6 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i
       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -252,6 +252,7 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                             const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -260,7 +261,6 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i
       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -269,6 +269,7 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -277,7 +278,6 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons
       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -286,6 +286,7 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
@@ -293,7 +294,6 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c
       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -302,6 +302,7 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -310,7 +311,6 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons
       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -319,6 +319,7 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
@@ -326,7 +327,6 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c
       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -335,6 +335,7 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -343,7 +344,6 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -352,6 +352,7 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
@@ -359,7 +360,6 @@ inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, c
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -370,10 +370,10 @@ inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const in
   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
     __syncthreads();
     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
@@ -382,10 +382,10 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const
   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
     __syncthreads();
     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index cf4b7daaa5..f4f701505a 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -18,6 +18,7 @@
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -26,7 +27,6 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c
       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -34,6 +34,7 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -42,7 +43,6 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca
       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -105,6 +105,7 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -113,7 +114,6 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c
       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -121,6 +121,7 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -129,7 +130,6 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c
       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -137,6 +137,7 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -145,7 +146,6 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedSca
       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -153,6 +153,7 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -161,7 +162,6 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -169,6 +169,7 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -176,7 +177,6 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -258,6 +258,7 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -266,7 +267,6 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -281,6 +281,7 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -289,7 +290,6 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -334,6 +334,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -342,7 +343,6 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -357,6 +357,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
@@ -364,7 +365,6 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -379,6 +379,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -387,7 +388,6 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -402,6 +402,7 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -409,7 +410,6 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 80f8d047b0..3d45310556 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -271,6 +271,7 @@ inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, c
                                         CeedScalar *__restrict__ r_V) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      __syncthreads();
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
       __syncthreads();
       // X derivative
@@ -288,7 +289,6 @@ inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, c
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
       }
-      __syncthreads();
     }
   }
 }
@@ -302,19 +302,19 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const Cee
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // X derivative
+      __syncthreads();
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
       }
-      __syncthreads();
       // Y derivative
+      __syncthreads();
       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
       }
-      __syncthreads();
       // Z derivative
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 7b28732444..5184f03443 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -49,6 +49,7 @@ inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, c
   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     // Load coefficients
+    __syncthreads();
     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
     __syncthreads();
     // Contract x direction
@@ -94,6 +95,7 @@ inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, con
   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     // Load coefficients
     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
     __syncthreads();
@@ -145,6 +147,7 @@ inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, c
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
+    __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     // Contract x direction
@@ -213,6 +216,7 @@ inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, con
     CeedScalar chebyshev_x[Q_1D];
 
     // Load coefficients
+    __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
     __syncthreads();
     for (CeedInt dim = 0; dim < 2; dim++) {
@@ -294,6 +298,7 @@ inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, c
       CeedScalar chebyshev_x[Q_1D];
 
       // Load coefficients
+      __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       // Contract x direction
@@ -372,6 +377,7 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
       CeedScalar chebyshev_x[Q_1D];
 
       // Load coefficients
+      __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       for (CeedInt dim = 0; dim < 3; dim++) {
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 9cb191ef7b..5b009d525e 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -130,7 +130,6 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
         InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -190,7 +189,6 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
         InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -321,7 +319,6 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
         GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
@@ -382,7 +379,6 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
         GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
       }
     }
-    __syncthreads();
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
index 634509c311..94e17c1dc0 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
@@ -19,6 +19,7 @@
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                             CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -27,7 +28,6 @@ inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id
       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -36,6 +36,7 @@ inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                             CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -44,7 +45,6 @@ inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id
       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -53,6 +53,7 @@ inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -61,7 +62,6 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const
       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -70,6 +70,7 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -78,7 +79,6 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -87,6 +87,7 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                         const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D] = *U;
   __syncthreads();
   if (t_id_x < P_1D && t_id_y < P_1D) {
@@ -94,7 +95,6 @@ inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, co
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -105,10 +105,10 @@ inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int
   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
     __syncthreads();
     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
@@ -117,10 +117,10 @@ inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const i
   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
     __syncthreads();
     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
@@ -218,6 +218,7 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedS
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                             const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -226,7 +227,6 @@ inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id
       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -235,6 +235,7 @@ inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                             const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -243,7 +244,6 @@ inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id
       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -252,6 +252,7 @@ inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                             const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -260,7 +261,6 @@ inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id
       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -269,6 +269,7 @@ inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -277,7 +278,6 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const
       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -286,6 +286,7 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
@@ -293,7 +294,6 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, co
       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -302,6 +302,7 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, co
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -310,7 +311,6 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const
       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -319,6 +319,7 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
@@ -326,7 +327,6 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, co
       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -335,6 +335,7 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, co
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                      const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -343,7 +344,6 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -352,6 +352,7 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
   __syncthreads();
   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
@@ -359,7 +360,6 @@ inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, co
       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -370,10 +370,10 @@ inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int
   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
     __syncthreads();
     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
@@ -382,10 +382,10 @@ inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const i
   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
     __syncthreads();
     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
-    __syncthreads();
   }
 }
 
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index 4285ae9c7d..a7d24c4cd1 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -18,6 +18,7 @@
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -26,7 +27,6 @@ inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, co
       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -34,6 +34,7 @@ inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -42,7 +43,6 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal
       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -105,6 +105,7 @@ inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restri
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -113,7 +114,6 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, co
       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -121,6 +121,7 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -129,7 +130,6 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, co
       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -137,6 +137,7 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -145,7 +146,6 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScal
       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -153,6 +153,7 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -161,7 +162,6 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScal
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -169,6 +169,7 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -176,7 +177,6 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedS
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -257,6 +257,7 @@ inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, co
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -265,7 +266,6 @@ inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, co
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -280,6 +280,7 @@ inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, co
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -288,7 +289,6 @@ inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, co
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -333,6 +333,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScal
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -341,7 +342,6 @@ inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScal
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -356,6 +356,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedS
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
@@ -363,7 +364,6 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedS
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -378,6 +378,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScal
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -386,7 +387,6 @@ inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScal
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
@@ -401,6 +401,7 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedS
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -408,7 +409,6 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedS
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 

From 0183ed61035d97ff853cf8c8e722c0fda76e54df Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 17 Jun 2025 13:20:03 -0600
Subject: [PATCH 422/571] GPU Assembly AtPoints (#1833)

* cuda - AtPoints diagonal assembly for gen

* hip - AtPoints diagonal assembly for gen

* pc - use subops for LinearAssemble[Add]Diagonal if composite

* gen - turn more numbers into named variables

* gen - fix alignment for assembly

* gen - check for only one active basis in/out

* HIP gen at points syntax error fixes

* hip - embarassing fix

* gen - add Tab helper to manage indentation

---------

Co-authored-by: Zach Atkins <Zach.Atkins@colorado.edu>
---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 1001 ++++++++++++----
 .../cuda-gen/ceed-cuda-gen-operator-build.h   |    2 +
 backends/cuda-gen/ceed-cuda-gen-operator.c    |  171 ++-
 backends/cuda-gen/ceed-cuda-gen.h             |    6 +-
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 1009 ++++++++++++-----
 .../hip-gen/ceed-hip-gen-operator-build.h     |    2 +
 backends/hip-gen/ceed-hip-gen-operator.c      |  183 ++-
 backends/hip-gen/ceed-hip-gen.h               |    6 +-
 include/ceed/backend.h                        |    1 +
 include/ceed/gen-tools.h                      |   27 +
 .../ceed/jit-source/cuda/cuda-gen-templates.h |   89 ++
 .../ceed/jit-source/hip/hip-gen-templates.h   |   89 ++
 interface/ceed-preconditioning.c              |   16 +-
 13 files changed, 2083 insertions(+), 519 deletions(-)
 create mode 100644 include/ceed/gen-tools.h

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 2ec854b59c..02bef7a16a 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -9,6 +9,7 @@
 
 #include <ceed.h>
 #include <ceed/backend.h>
+#include <ceed/gen-tools.h>
 #include <ceed/jit-tools.h>
 #include <cuda_runtime.h>
 
@@ -176,9 +177,10 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 //------------------------------------------------------------------------------
 // Setup fields
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt max_dim, CeedInt Q,
-                                                     CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
+                                                     CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse,
+                                                     CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                     bool use_3d_slices) {
   bool      is_tensor = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
@@ -197,7 +199,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   bool use_previous_field = field_reuse.index != -1;
 
   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
-  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
+  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -215,20 +217,20 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Set field constants
-  code << "  const CeedInt dim" << var_suffix << " = " << dim << ";\n";
+  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
   if (is_tensor && !is_all_tensor) {
     CeedInt P = 0;
 
     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
-    code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
+    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
   }
-  code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
+  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
-    code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
+    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
 
   // Load basis data
-  code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
   switch (eval_mode) {
     case CEED_EVAL_NONE:
       break;
@@ -257,10 +259,10 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
       if (use_previous_field) {
         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-        code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
       } else {
-        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       }
       break;
     case CEED_EVAL_GRAD:
@@ -289,10 +291,10 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (use_previous_field) {
           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-          code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
         }
       }
       if (is_at_points) break;  // No G mat for AtPoints
@@ -302,10 +304,10 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-          code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         }
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
@@ -316,20 +318,20 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
-            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-            code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         } else {
           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
-            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                  << (is_tensor ? "" : var_suffix) << "];\n";
-            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                  << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         }
@@ -350,9 +352,10 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 //------------------------------------------------------------------------------
 // Restriction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt field_input_buffer[],
-                                                       CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
-                                                       bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
+                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
+                                                       CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                       bool use_3d_slices) {
   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
@@ -379,26 +382,27 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
 
       // Restriction was already done for previous input
-      code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
+      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
-        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
         // Otherwise we're using the scratch space
-        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
       }
       switch (rstr_type) {
         case CEED_RESTRICTION_STANDARD: {
           CeedInt comp_stride;
 
           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-          code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-          code << "    // CompStride: " << comp_stride << "\n";
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-          code << "    ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
-               << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
+               << ");\n";
           break;
         }
         case CEED_RESTRICTION_STRIDED: {
@@ -412,16 +416,18 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
           if (!has_backend_strides) {
             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
           }
-          code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-          code << "    ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
-               << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
+               << var_suffix << ");\n";
           break;
         }
         case CEED_RESTRICTION_POINTS: {
           CeedInt comp_stride;
 
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-          code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
           break;
         }
@@ -439,12 +445,13 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
         CeedInt comp_stride;
 
         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-        code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-        code << "    // CompStride: " << comp_stride << "\n";
+        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
-             << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
         break;
       }
       case CEED_RESTRICTION_STRIDED: {
@@ -458,9 +465,11 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
         if (!has_backend_strides) {
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
-        code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
-             << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
         break;
       }
       case CEED_RESTRICTION_POINTS:
@@ -480,9 +489,9 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 // Basis
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
-                                                 bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
+                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
+                                                 bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   bool      is_tensor = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
@@ -509,20 +518,20 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Basis
-  code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
   if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         if (!use_3d_slices && !is_at_points) {
-          code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
+          code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
           std::string function_name = is_tensor
@@ -530,8 +539,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                                           : "InterpNonTensor";
           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
@@ -539,14 +548,14 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
@@ -554,31 +563,31 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                                       (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
         if (is_at_points) {
-          code << "    // Nothing to do AtPoints\n";
+          code << tab << "// Nothing to do AtPoints\n";
         } else {
           CeedBasis_Cuda_shared *basis_data;
           std::string            function_name = is_tensor
                                                      ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                                      : "WeightNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
-          code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
         }
         break;
       }
@@ -591,14 +600,14 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   } else {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
         break;  // No action
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
@@ -606,21 +615,21 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                         : "InterpTransposeNonTensor";
           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
-        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
@@ -628,12 +637,12 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
@@ -653,8 +662,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
 //------------------------------------------------------------------------------
 // QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt max_dim, CeedInt max_num_points,
-                                                     CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt max_dim,
+                                                     CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                      std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
@@ -664,46 +673,47 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
   CeedElemRestriction elem_rstr;
 
   // Setup output arrays
-  code << "\n    // -- Output field setup\n";
+  code << "\n";
+  code << tab << "// -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char *field_name;
     std::string var_suffix = "_out_" + std::to_string(i);
 
     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-    code << "    // ---- Output field " << i << ": " << field_name << "\n";
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         if (is_at_points) {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
                << "];\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
                << "[i] = 0.0;\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
                << "];\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
                << "[i] = 0.0;\n";
         } else if (use_3d_slices) {
           // Accumulator for gradient slices
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
@@ -719,46 +729,48 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 
   if (is_at_points) {
     // We need to handle batches of points
-    code << "\n    // Note: Using batches of points\n";
-    code << "    const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
-    code << "    #pragma unroll\n";
-    code << "    for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
-    code << "      const CeedInt p = i % max_num_points;\n\n";
-
-    code << "      // -- Coordinates\n";
-    code << "      CeedScalar r_x[max_dim];\n";
-    code << "      ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
-
-    code << "      // -- Input fields\n";
+    code << "\n";
+    code << tab << "// Note: Using batches of points\n";
+    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
+    tab.push();
+    code << tab << "const CeedInt p = i % max_num_points;\n\n";
+
+    code << tab << "// -- Coordinates\n";
+    code << tab << "CeedScalar r_x[max_dim];\n";
+    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-      code << "      // ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-          code << "      GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
-          code << "      r_s" << var_suffix << "[0] = 1.0;\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_DIV:
@@ -767,24 +779,25 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
                   // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -798,24 +811,26 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 
   } else if (use_3d_slices) {
     // We treat quadrature points per slice in 3d to save registers
-    code << "\n    // Note: Using planes of 3D elements\n";
-    code << "    #pragma unroll\n";
-    code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
-    code << "      // -- Input fields\n";
+    code << "\n";
+    code << tab << "// Note: Using planes of 3D elements\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-      code << "      // ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
           bool is_strided;
 
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
@@ -831,39 +846,42 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
             if (!has_backend_strides) {
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
-            code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
-                 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
+            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
+                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
           } else {
             CeedSize                  l_size = 0;
             CeedInt                   comp_stride;
             CeedElemRestriction_Cuda *rstr_data;
 
             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-            code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-            code << "      // CompStride: " << comp_stride << "\n";
+            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
-                 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
+                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
-          code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
-          code << "      }\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
+          tab.push();
+          code << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
+          tab.pop();
+          code << tab << "}\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
                << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
-          code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_DIV:
@@ -872,24 +890,25 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
                   // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -901,50 +920,54 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
       }
     }
   } else {
-    code << "\n    // Note: Using full elements\n";
-    code << "    {\n";
-    code << "      // -- Input fields\n";
+    code << "\n";
+    code << tab << "// Note: Using full elements\n";
+    code << tab << "{\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-      code << "      // ---- Input field " << i << ": " << field_name << "\n";
-      code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
-    code << "      // -- Output fields\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
-      code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
 
   // Input and output buffers
-  code << "\n      // -- QFunction inputs and outputs\n";
-  code << "      // ---- Inputs\n";
-  code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
+  code << "\n";
+  code << tab << "// -- QFunction inputs and outputs\n";
+  code << tab << "// ---- Inputs\n";
+  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char *field_name;
 
     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-    code << "      // ------ Input field " << i << ": " << field_name << "\n";
-    code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
+    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
+    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
-  code << "      // ---- Outputs\n";
-  code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
+  code << tab << "// ---- Outputs\n";
+  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char *field_name;
 
     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-    code << "      // ------ Output field " << i << ": " << field_name << "\n";
-    code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
+    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
+    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
 
   // Apply QFunction
-  code << "\n      // -- Apply QFunction\n";
-  code << "      " << qfunction_name << "(ctx, ";
+  code << "\n";
+  code << tab << "// -- Apply QFunction\n";
+  code << tab << "" << qfunction_name << "(ctx, ";
   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "1";
   } else {
@@ -954,17 +977,18 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 
   if (is_at_points) {
     // Map back to coefficients
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE: {
           CeedInt             comp_stride;
@@ -973,24 +997,28 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
-          code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
-          code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
           break;
         }
         case CEED_EVAL_INTERP:
-          code << "      if (i >= points.num_per_elem[elem]) {\n";
-          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
-          code << "      }\n";
-          code << "      InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      if (i >= points.num_per_elem[elem]) {\n";
-          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
-          code << "      }\n";
-          code << "      GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -1004,30 +1032,35 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
     }
   } else if (use_3d_slices) {
     // Copy or apply transpose grad, if needed
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
-          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
-          code << "      }\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
-          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
-          code << "      }\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
+          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
                << var_suffix << ", r_q" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -1040,7 +1073,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
       }
     }
   }
-  code << "    }\n";
+  tab.pop();
+  code << tab << "}\n";
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1057,6 +1091,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Cuda_gen  *data;
   std::ostringstream      code;
+  Tab                     tab;
 
   CeedCallBackend(CeedOperatorGetData(op, &data));
   {
@@ -1186,19 +1221,19 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     CeedCallBackend(CeedGetData(ceed, &ceed_data));
     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
-      code << "// AtomicAdd fallback source\n";
-      code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
+      code << tab << "// AtomicAdd fallback source\n";
+      code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
     }
   }
 
   // Load basis source files
   if (!is_all_nontensor) {
-    code << "// Tensor basis source\n";
-    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
   }
   if (!is_all_tensor) {
-    code << "// Non-tensor basis source\n";
-    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
   }
   if (!is_all_tensor && !is_all_nontensor) {
     code << "// Tensor basis source\n";
@@ -1218,11 +1253,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
 
   // Define CEED_Q_VLA
-  code << "\n#undef CEED_Q_VLA\n";
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
-    code << "#define CEED_Q_VLA 1\n\n";
+    code << tab << "#define CEED_Q_VLA 1\n\n";
   } else {
-    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
   }
 
   // Add user QFunction source
@@ -1232,26 +1267,27 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
 
-    code << "// User QFunction source\n";
-    code << "#include \"" << source_path << "\"\n\n";
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
   }
 
   // Setup
-  code << "\n// -----------------------------------------------------------------------------\n";
-  code << "// Operator Kernel\n";
-  code << "// \n";
-  code << "// d_[in,out]_i:   CeedVector device array\n";
-  code << "// r_[in,out]_e_i: Element vector register\n";
-  code << "// r_[in,out]_q_i: Quadrature space vector register\n";
-  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
-  code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
-  code << "// \n";
-  code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
-  code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
-  code << "// -----------------------------------------------------------------------------\n";
-  code << "extern \"C\" __global__ void " << operator_name
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
           "points) {\n";
+  tab.push();
 
   // Scratch buffers
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1259,33 +1295,33 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
     }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
   }
 
-  code << "  const CeedInt max_dim = " << max_dim << ";\n";
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
   if (!is_all_tensor) {
-    code << "  const CeedInt Q = " << Q << ";\n";
+    code << tab << "const CeedInt Q = " << Q << ";\n";
   }
   if (!is_all_nontensor) {
-    code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
   }
   if (is_at_points) {
-    code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
-    code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+    code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+    code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
   }
 
   // Shared data
-  code << "  extern __shared__ CeedScalar slice[];\n";
-  code << "  SharedData_Cuda data;\n";
-  code << "  data.t_id_x = threadIdx.x;\n";
-  code << "  data.t_id_y = threadIdx.y;\n";
-  code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Cuda data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
@@ -1392,21 +1428,22 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   }
 
   // Initialize constants, and matrices B and G
-  code << "\n  // Input field constants and basis data\n";
+  code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], max_dim,
-                                                              Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
   }
-  code << "\n  // Output field constants and basis data\n";
+  code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
                                                               max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
-  code << "\n  // Element loop\n";
-  code << "  __syncthreads();\n";
-  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
 
   // -- Compute minimum buffer space needed
   CeedInt max_rstr_buffer_size = 1;
@@ -1439,8 +1476,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     }
   }
-  code << "    // Scratch restriction buffer space\n";
-  code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
 
   // -- Determine best input field processing order
   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
@@ -1487,49 +1524,51 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   }
 
   // -- Input restriction and basis
-  code << "\n    // -- Input field restrictions and basis actions\n";
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char   *field_name;
     const CeedInt f = input_field_order[i];
 
     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
-    code << "    // ---- Input field " << f << ": " << field_name << "\n";
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], max_dim,
-                                                                Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, is_all_tensor,
-                                                          is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // -- Q function
-  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, max_dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
-                                                            num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d,
-                                                            is_all_tensor, is_at_points, use_3d_slices));
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
 
   // -- Output basis and restriction
-  code << "\n    // -- Output field basis action and restrictions\n";
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char *field_name;
 
     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-    code << "    // ---- Output field " << i << ": " << field_name << "\n";
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
                                                           is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
-                                                                is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
+                                                                false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Close loop and function
-  code << "  }\n";
-  code << "}\n";
-  code << "// -----------------------------------------------------------------------------\n\n";
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
 
   // Compile
   {
@@ -1553,3 +1592,467 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
 }
 
 //------------------------------------------------------------------------------
+// Build AtPoints assembly operator kernel
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, bool is_full, bool *is_good_build) {
+  bool                    is_all_tensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                    ceed;
+  CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Cuda_gen *qf_data;
+  CeedQFunction           qf;
+  CeedOperatorField      *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda_gen  *data;
+  std::ostringstream      code;
+  Tab                     tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  {
+    CeedElemRestriction rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+
+  // Add atomicAdd function for old NVidia architectures
+  {
+    Ceed_Cuda            *ceed_data;
+    struct cudaDeviceProp prop;
+
+    CeedCallBackend(CeedGetData(ceed, &ceed_data));
+    CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
+    if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
+      code << tab << "// AtomicAdd fallback source\n";
+      code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
+    }
+  }
+
+  // Load basis source files
+  code << tab << "// Tensor basis source\n";
+  code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  code << tab << "// AtPoints basis source\n";
+  code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
+  code << tab << "// CodeGen operator source\n";
+  code << tab << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  if (is_full) {
+    operator_name = "CeedKernelCudaGenOperatorFullAssembly_" + qfunction_name;
+  } else {
+    operator_name = "CeedKernelCudaGenOperatorDiagonalAssembly_" + qfunction_name;
+  }
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  code << tab << "#define CEED_Q_VLA 1\n\n";
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+  code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Cuda data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        input_matrix_reuse[i].index     = j;
+        input_matrix_reuse[i].is_input  = true;
+        input_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = true;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = false;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt active_field_index = -1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      std::string var_suffix = "_in_" + std::to_string(f);
+
+      code << tab << "// Active field - no restriction or basis action here\n";
+      if (active_field_index == -1) {
+        active_field_index = f;
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
+             << "] = {0.0};\n";
+      } else {
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
+      }
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                            is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+
+  // -- Loop over active field
+  std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
+
+  code << "\n" << tab << "// Loop over nodes in active field\n";
+  code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
+       << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
+  tab.push();
+
+  // -- Set current active node and component to 1
+  code << tab << "// Set current active node and component to 1.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
+       << active_var_suffix << ");\n\n";
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Restriction
+    if (is_full) {
+      // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY
+    } else {
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
+       << active_var_suffix << ");\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  // Compile
+  {
+    bool          is_compile_good = false;
+    const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
+
+    data->thread_1d = T_1d;
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
+                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
+                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, false, is_good_build);
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
index 87b2674b7c..7c116e1bfa 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
@@ -7,3 +7,5 @@
 #pragma once
 
 CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index f1295252a6..5b9acfdb86 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -28,6 +28,8 @@ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module));
+  if (impl->module_assemble_full) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_full));
+  if (impl->module_assemble_diagonal) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_diagonal));
   if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
   CeedCallBackend(CeedDestroy(&ceed));
@@ -333,11 +335,173 @@ static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector in
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// AtPoints diagonal assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  Ceed                   ceed;
+  CeedOperator_Cuda_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_diagonal && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(
+        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(op, &is_build_good));
+    }
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                    is_run_good = true;
+    Ceed_Cuda              *cuda_data;
+    CeedInt                 num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode            eval_mode;
+    CeedScalar             *assembled_array;
+    CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Cuda_gen *qf_data;
+    CeedQFunction           qf;
+    CeedOperatorField      *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
+        }
+        if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
+        CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+    int   max_threads_per_block, min_grid_size, grid;
+
+    CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+    int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+    CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
+
+    CeedCallBackend(
+        CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) data->use_assembly_fallback = true;
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
-  bool                   is_composite;
+  bool                   is_composite, is_at_points;
   Ceed                   ceed;
   CeedOperator_Cuda_gen *impl;
 
@@ -350,6 +514,11 @@ int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
   } else {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
   }
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    CeedCallBackend(
+        CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
   CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index a9999c535b..ef80881a77 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -12,13 +12,13 @@
 #include <cuda.h>
 
 typedef struct {
-  bool           use_fallback;
+  bool           use_fallback, use_assembly_fallback;
   CeedInt        dim;
   CeedInt        Q, Q_1d;
   CeedInt        max_P_1d;
   CeedInt        thread_1d;
-  CUmodule       module;
-  CUfunction     op;
+  CUmodule       module, module_assemble_full, module_assemble_diagonal;
+  CUfunction     op, assemble_full, assemble_diagonal;
   FieldsInt_Cuda indices;
   Fields_Cuda    fields;
   Fields_Cuda    B;
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 0638c12dca..a4e410c4a6 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -9,6 +9,7 @@
 
 #include <ceed.h>
 #include <ceed/backend.h>
+#include <ceed/gen-tools.h>
 #include <ceed/jit-tools.h>
 
 #include <iostream>
@@ -203,9 +204,10 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 //------------------------------------------------------------------------------
 // Setup fields
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                    CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, CeedInt max_dim, CeedInt Q, CeedInt Q_1d,
-                                                    bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
+                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
+                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                    bool use_3d_slices) {
   bool      is_tensor = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
@@ -224,7 +226,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
   bool use_previous_field = field_reuse.index != -1;
 
   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
-  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
+  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
 
   // Get field data
   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
@@ -242,20 +244,20 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Set field constants
-  code << "  const CeedInt dim" << var_suffix << " = " << dim << ";\n";
+  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
   if (is_tensor && !is_all_tensor) {
     CeedInt P = 0;
 
     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
-    code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
+    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
   }
-  code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
+  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
-    code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
+    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
   }
 
   // Load basis data
-  code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
   switch (eval_mode) {
     case CEED_EVAL_NONE:
       break;
@@ -284,10 +286,10 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
       if (use_previous_field) {
         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-        code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
       } else {
-        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       }
       break;
     case CEED_EVAL_GRAD:
@@ -316,10 +318,10 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (use_previous_field) {
           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-          code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
         }
       }
       if (is_at_points) break;  // No G mat for AtPoints
@@ -329,10 +331,10 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-          code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
         }
       } else {
         bool has_collo_grad = basis_data->d_collo_grad_1d;
@@ -343,20 +345,20 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
-            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
-            code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         } else {
           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
           } else {
-            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                  << (is_tensor ? "" : var_suffix) << "];\n";
-            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                  << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         }
@@ -377,9 +379,10 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 //------------------------------------------------------------------------------
 // Restriction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt field_input_buffer[],
-                                                      CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
-                                                      bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
+                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
+                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                      bool use_3d_slices) {
   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
@@ -406,26 +409,27 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
 
       // Restriction was already done for previous input
-      code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
+      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
-        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
         // Otherwise we're using the scratch space
-        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
       }
       switch (rstr_type) {
         case CEED_RESTRICTION_STANDARD: {
           CeedInt comp_stride;
 
           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-          code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-          code << "    // CompStride: " << comp_stride << "\n";
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-          code << "    ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
-               << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
+               << ");\n";
           break;
         }
         case CEED_RESTRICTION_STRIDED: {
@@ -439,16 +443,18 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
           if (!has_backend_strides) {
             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
           }
-          code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-          code << "    ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
-               << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
+          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
+               << var_suffix << ");\n";
           break;
         }
         case CEED_RESTRICTION_POINTS: {
           CeedInt comp_stride;
 
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-          code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
           break;
         }
@@ -466,12 +472,13 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
         CeedInt comp_stride;
 
         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-        code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-        code << "    // CompStride: " << comp_stride << "\n";
+        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
-             << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
         break;
       }
       case CEED_RESTRICTION_STRIDED: {
@@ -485,9 +492,11 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
         if (!has_backend_strides) {
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
-        code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
-             << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
+        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
         break;
       }
       case CEED_RESTRICTION_POINTS:
@@ -507,7 +516,7 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 // Basis
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
+static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
                                                 bool is_at_points, bool use_3d_slices) {
   bool      is_tensor = true;
@@ -536,20 +545,20 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
   // Basis
-  code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
   if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         if (!use_3d_slices && !is_at_points) {
-          code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
+          code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
           std::string function_name = is_tensor
@@ -557,8 +566,8 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                                           : "InterpNonTensor";
           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
@@ -566,14 +575,14 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
 
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
@@ -581,31 +590,31 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                                       (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT: {
         if (is_at_points) {
-          code << "    // Nothing to do AtPoints\n";
+          code << tab << "// Nothing to do AtPoints\n";
         } else {
           CeedBasis_Hip_shared *basis_data;
           std::string           function_name = is_tensor
                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                                     : "WeightNonTensor";
 
-          code << "    CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
           data->W = basis_data->d_q_weight_1d;
-          code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+          code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
         }
         break;
       }
@@ -618,14 +627,14 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
   } else {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
         break;  // No action
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
@@ -633,21 +642,21 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                         : "InterpTransposeNonTensor";
           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
       case CEED_EVAL_GRAD:
-        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
         if (is_at_points) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
@@ -655,12 +664,12 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name = "GradTransposeNonTensor";
 
-          code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
         }
         break;
@@ -680,56 +689,58 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
 //------------------------------------------------------------------------------
 // QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt max_dim, CeedInt max_num_points,
-                                                    CeedInt num_input_fields, CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
-                                                    CeedInt num_output_fields, CeedOperatorField *op_output_fields,
-                                                    CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
-                                                    bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
+                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
+                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
+                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
+                                                    bool use_3d_slices) {
   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
 
   // Setup output arrays
-  code << "\n    // -- Output field setup\n";
+  code << "\n";
+  code << tab << "// -- Output field setup\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char *field_name;
     std::string var_suffix = "_out_" + std::to_string(i);
 
     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-    code << "    // ---- Output field " << i << ": " << field_name << "\n";
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         if (is_at_points) {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
                << "];\n";
         }
         break;
       case CEED_EVAL_INTERP:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
                << "[i] = 0.0;\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
                << "];\n";
         }
         break;
       case CEED_EVAL_GRAD:
         if (is_at_points) {
           // Accumulator for point data
-          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
                << "[i] = 0.0;\n";
         } else if (use_3d_slices) {
           // Accumulator for gradient slices
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
-          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
         } else {
-          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
         }
         break;
@@ -745,46 +756,48 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 
   if (is_at_points) {
     // We need to handle batches of points
-    code << "\n    // Note: Using batches of points\n";
-    code << "    const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
-    code << "    #pragma unroll\n";
-    code << "    for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
-    code << "      const CeedInt p = i % max_num_points;\n\n";
-
-    code << "      // -- Coordinates\n";
-    code << "      CeedScalar r_x[max_dim];\n";
-    code << "      ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
-
-    code << "      // -- Input fields\n";
+    code << "\n";
+    code << tab << "// Note: Using batches of points\n";
+    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
+    tab.push();
+    code << tab << "const CeedInt p = i % max_num_points;\n\n";
+
+    code << tab << "// -- Coordinates\n";
+    code << tab << "CeedScalar r_x[max_dim];\n";
+    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-      code << "      // ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-          code << "      GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
-          code << "      r_s" << var_suffix << "[0] = 1.0;\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_DIV:
@@ -793,24 +806,25 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
                   // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -824,24 +838,26 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 
   } else if (use_3d_slices) {
     // We treat quadrature points per slice in 3d to save registers
-    code << "\n    // Note: Using planes of 3D elements\n";
-    code << "    #pragma unroll\n";
-    code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
-    code << "      // -- Input fields\n";
+    code << "\n";
+    code << tab << "// Note: Using planes of 3D elements\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_in_" + std::to_string(i);
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-      code << "      // ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
           bool is_strided;
 
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
@@ -857,39 +873,42 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
             if (!has_backend_strides) {
               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
             }
-            code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
-                 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
+            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
+                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
           } else {
             CeedSize                 l_size = 0;
             CeedInt                  comp_stride;
             CeedElemRestriction_Hip *rstr_data;
 
             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-            code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-            code << "      // CompStride: " << comp_stride << "\n";
+            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
-                 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
+                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
           }
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
-          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
-          code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
-          code << "      }\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
+          tab.push();
+          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
+          tab.pop();
+          code << tab << "}\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
                << var_suffix << ", r_s" << var_suffix << ");\n";
           break;
         case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
-          code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_DIV:
@@ -898,24 +917,25 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
                   // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
           // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
@@ -927,50 +947,54 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
       }
     }
   } else {
-    code << "\n    // Note: Using full elements\n";
-    code << "    {\n";
-    code << "      // -- Input fields\n";
+    code << "\n";
+    code << tab << "// Note: Using full elements\n";
+    code << tab << "{\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char *field_name;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-      code << "      // ---- Input field " << i << ": " << field_name << "\n";
-      code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
-    code << "      // -- Output fields\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
-      code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
 
   // Input and output buffers
-  code << "\n      // -- QFunction inputs and outputs\n";
-  code << "      // ---- Inputs\n";
-  code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
+  code << "\n";
+  code << tab << "// -- QFunction inputs and outputs\n";
+  code << tab << "// ---- Inputs\n";
+  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char *field_name;
 
     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
-    code << "      // ------ Input field " << i << ": " << field_name << "\n";
-    code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
+    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
+    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
-  code << "      // ---- Outputs\n";
-  code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
+  code << tab << "// ---- Outputs\n";
+  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char *field_name;
 
     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-    code << "      // ------ Output field " << i << ": " << field_name << "\n";
-    code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
+    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
+    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
 
   // Apply QFunction
-  code << "\n      // -- Apply QFunction\n";
-  code << "      " << qfunction_name << "(ctx, ";
+  code << "\n";
+  code << tab << "// -- Apply QFunction\n";
+  code << tab << "" << qfunction_name << "(ctx, ";
   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "1";
   } else {
@@ -980,17 +1004,18 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
 
   if (is_at_points) {
     // Map back to coefficients
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE: {
           CeedInt             comp_stride;
@@ -999,24 +1024,28 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
-          code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
-          code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
           break;
         }
         case CEED_EVAL_INTERP:
-          code << "      if (i >= points.num_per_elem[elem]) {\n";
-          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
-          code << "      }\n";
-          code << "      InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      if (i >= points.num_per_elem[elem]) {\n";
-          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
-          code << "      }\n";
-          code << "      GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -1030,30 +1059,35 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     }
   } else if (use_3d_slices) {
     // Copy or apply transpose grad, if needed
-    code << "\n      // -- Output fields\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char *field_name;
       std::string var_suffix = "_out_" + std::to_string(i);
       std::string P_name     = "P_1d" + var_suffix;
 
       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-      code << "      // ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
-          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
-          code << "      }\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
           break;
         case CEED_EVAL_INTERP:
-          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
-          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
-          code << "      }\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
+          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
                << var_suffix << ", r_q" << var_suffix << ");\n";
           break;
           // LCOV_EXCL_START
@@ -1066,7 +1100,8 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
       }
     }
   }
-  code << "    }\n";
+  tab.pop();
+  code << tab << "}\n";
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1083,6 +1118,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   CeedOperatorField     *op_input_fields, *op_output_fields;
   CeedOperator_Hip_gen  *data;
   std::ostringstream     code;
+  Tab                    tab;
 
   CeedCallBackend(CeedOperatorGetData(op, &data));
   {
@@ -1205,23 +1241,23 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 
   // Load basis source files
   if (!is_all_nontensor) {
-    code << "// Tensor basis source\n";
-    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
   }
   if (!is_all_tensor) {
-    code << "// Non-tensor basis source\n";
-    code << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
   }
   if (is_at_points) {
-    code << "// AtPoints basis source\n";
-    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
+    code << tab << "// AtPoints basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
   }
   if (!is_all_tensor && !is_all_nontensor) {
-    code << "// Tensor basis source\n";
-    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
   }
-  code << "// CodeGen operator source\n";
-  code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
+  code << tab << "// CodeGen operator source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
 
   // Get QFunction name
   std::string qfunction_name(qf_data->qfunction_name);
@@ -1230,11 +1266,11 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
 
   // Define CEED_Q_VLA
-  code << "\n#undef CEED_Q_VLA\n";
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
-    code << "#define CEED_Q_VLA 1\n\n";
+    code << tab << "#define CEED_Q_VLA 1\n\n";
   } else {
-    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
   }
 
   // Add user QFunction source
@@ -1244,26 +1280,27 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
 
-    code << "// User QFunction source\n";
-    code << "#include \"" << source_path << "\"\n\n";
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
   }
 
   // Setup
-  code << "\n// -----------------------------------------------------------------------------\n";
-  code << "// Operator Kernel\n";
-  code << "// \n";
-  code << "// d_[in,out]_i:   CeedVector device array\n";
-  code << "// r_[in,out]_e_i: Element vector register\n";
-  code << "// r_[in,out]_q_i: Quadrature space vector register\n";
-  code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
-  code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
-  code << "// \n";
-  code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
-  code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
-  code << "// -----------------------------------------------------------------------------\n";
-  code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
   code << "__global__ void " << operator_name
        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
+  tab.push();
 
   // Scratch buffers
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1271,33 +1308,33 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
     }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
   }
 
-  code << " const CeedInt max_dim = " << max_dim << ";\n";
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
   if (!is_all_tensor) {
-    code << "  const CeedInt Q = " << Q << ";\n";
+    code << tab << "const CeedInt Q = " << Q << ";\n";
   }
   if (!is_all_nontensor) {
-    code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
   }
   if (is_at_points) {
-    code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
-    code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+    code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+    code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
   }
 
   // Shared data
-  code << "  extern __shared__ CeedScalar slice[];\n";
-  code << "  SharedData_Hip data;\n";
-  code << "  data.t_id_x = threadIdx.x;\n";
-  code << "  data.t_id_y = threadIdx.y;\n";
-  code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Hip data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
 
   // -- Determine input mat reuse
   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
@@ -1404,21 +1441,22 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   }
 
   // Initialize constants, and matrices B and G
-  code << "\n  // Input field constants and basis data\n";
+  code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], max_dim, Q,
-                                                             Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
   }
-  code << "\n  // Output field constants and basis data\n";
+  code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], max_dim,
-                                                             Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
-  code << "\n  // Element loop\n";
-  code << "  __syncthreads();\n";
-  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
 
   // -- Compute minimum buffer space needed
   CeedInt max_rstr_buffer_size = 1;
@@ -1451,8 +1489,8 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     }
   }
-  code << "    // Scratch restriction buffer space\n";
-  code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
 
   // -- Determine best input field processing order
   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
@@ -1499,49 +1537,51 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   }
 
   // -- Input restriction and basis
-  code << "\n    // -- Input field restrictions and basis actions\n";
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char   *field_name;
     const CeedInt f = input_field_order[i];
 
     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
-    code << "    // ---- Input field " << f << ": " << field_name << "\n";
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], max_dim,
-                                                               Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                               max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, is_all_tensor,
-                                                         is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // -- Q function
-  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, max_dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
-                                                           num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_all_tensor,
-                                                           is_at_points, use_3d_slices));
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));
 
   // -- Output basis and restriction
-  code << "\n    // -- Output field basis action and restrictions\n";
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char *field_name;
 
     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
-    code << "    // ---- Output field " << i << ": " << field_name << "\n";
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, is_all_tensor,
-                                                         is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
-                                                               is_all_tensor, is_at_points, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
+                                                               false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Close loop and function
-  code << "  }\n";
-  code << "}\n";
-  code << "// -----------------------------------------------------------------------------\n\n";
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
 
   CeedInt block_sizes[3] = {0, 0, 0};
   CeedInt num_elem;
@@ -1571,3 +1611,460 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
 }
 
 //------------------------------------------------------------------------------
+// Build AtPoints assembly operator kernel
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
+  bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                   ceed;
+  CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Hip_gen *qf_data;
+  CeedQFunction          qf;
+  CeedOperatorField     *op_input_fields, *op_output_fields;
+  CeedOperator_Hip_gen  *data;
+  std::ostringstream     code;
+  Tab                    tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  {
+    CeedElemRestriction rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+
+  // Load basis source files
+  code << tab << "// Tensor basis source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  code << tab << "// AtPoints basis source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
+  code << tab << "// CodeGen operator source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  if (is_full) {
+    operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
+  } else {
+    operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
+  }
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  code << tab << "#define CEED_Q_VLA 1\n\n";
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+  code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Hip data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        input_matrix_reuse[i].index     = j;
+        input_matrix_reuse[i].is_input  = true;
+        input_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = true;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = false;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt active_field_index = -1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      std::string var_suffix = "_in_" + std::to_string(f);
+
+      code << tab << "// Active field - no restriction or basis action here\n";
+      if (active_field_index == -1) {
+        active_field_index = f;
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
+             << "] = {0.0};\n";
+      } else {
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
+      }
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                           is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+
+  // -- Loop over active field
+  std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
+
+  code << "\n" << tab << "// Loop over nodes in active field\n";
+  code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
+       << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
+  tab.push();
+
+  // -- Set current active node and component to 1
+  code << tab << "// Set current active node and component to 1.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
+       << active_var_suffix << ");\n\n";
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Restriction
+    if (is_full) {
+      // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY
+    } else {
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
+       << active_var_suffix << ");\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  CeedInt block_sizes[3] = {0, 0, 0};
+  CeedInt num_elem;
+
+  // Compile
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
+  block_sizes[2] = 1;
+  {
+    bool is_compile_good = false;
+
+    data->thread_1d = block_sizes[0];
+    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
+                                       is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
+                                       "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
+                                        is_full ? &data->assemble_full : &data->assemble_diagonal));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h
index 3193505f5c..ac74461ecb 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.h
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.h
@@ -8,3 +8,5 @@
 
 CEED_INTERN int BlockGridCalculate_Hip_gen(CeedInt dim, CeedInt num_elem, CeedInt P_1d, CeedInt Q_1d, CeedInt *block_sizes);
 CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 3b780d295a..e4622612c6 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -37,6 +37,8 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
     }
   }
   if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
+  if (impl->module_assemble_full) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_full));
+  if (impl->module_assemble_diagonal) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_diagonal));
   if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
   CeedCallBackend(CeedDestroy(&ceed));
@@ -299,11 +301,186 @@ static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector inp
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// AtPoints diagonal assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_diagonal && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(
+        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(op, &is_build_good));
+    }
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                   is_run_good = true;
+    Ceed_Hip              *hip_data;
+    CeedInt                num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode           eval_mode;
+    CeedScalar            *assembled_array;
+    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Hip_gen *qf_data;
+    CeedQFunction          qf;
+    CeedOperatorField     *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &hip_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
+        }
+        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
+        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+
+    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
+    block_sizes[2] = 1;
+    if (data->dim == 1) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 2) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 3) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    }
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) data->use_assembly_fallback = true;
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
 int CeedOperatorCreate_Hip_gen(CeedOperator op) {
-  bool                  is_composite;
+  bool                  is_composite, is_at_points;
   Ceed                  ceed;
   CeedOperator_Hip_gen *impl;
 
@@ -316,6 +493,10 @@ int CeedOperatorCreate_Hip_gen(CeedOperator op) {
   } else {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
   }
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
   CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index 4335302471..cd7dfe6773 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -12,14 +12,14 @@
 #include <hip/hip_runtime.h>
 
 typedef struct {
-  bool          use_fallback;
+  bool          use_fallback, use_assembly_fallback;
   CeedInt       dim;
   CeedInt       Q, Q_1d;
   CeedInt       max_P_1d;
   CeedInt       thread_1d;
   hipStream_t   streams[CEED_COMPOSITE_MAX];
-  hipModule_t   module;
-  hipFunction_t op;
+  hipModule_t   module, module_assemble_full, module_assemble_diagonal;
+  hipFunction_t op, assemble_full, assemble_diagonal;
   FieldsInt_Hip indices;
   Fields_Hip    fields;
   Fields_Hip    B;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index a3707eba4a..3a5171a8b5 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -460,6 +460,7 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op);
 CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback);
 CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent);
 CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent);
+CEED_INTERN int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values);
 CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op);
 
 CEED_INTERN int CeedMatrixMatrixMultiply(Ceed ceed, const CeedScalar *mat_A, const CeedScalar *mat_B, CeedScalar *mat_C, CeedInt m, CeedInt n,
diff --git a/include/ceed/gen-tools.h b/include/ceed/gen-tools.h
new file mode 100644
index 0000000000..f1f3743000
--- /dev/null
+++ b/include/ceed/gen-tools.h
@@ -0,0 +1,27 @@
+#include <ceed.h>
+#include <sstream>
+
+class Tab {
+ private:
+  CeedInt       _num_tabs{0};
+  const CeedInt _width{2};
+
+  template <class OStream>
+  friend OStream &operator<<(OStream &os, const Tab &tab);
+
+ public:
+  Tab &push() {
+    _num_tabs++;
+    return *this;
+  }
+  Tab &pop() {
+    if (_num_tabs > 0) _num_tabs--;
+    return *this;
+  }
+};
+
+template <class OStream>
+OStream &operator<<(OStream &os, const Tab &tab) {
+  os << std::string(tab._num_tabs * tab._width, ' ');
+  return os;
+}
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index ba4ecfc2bd..3f8508e544 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -53,6 +53,19 @@ inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, con
 // 1D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
@@ -95,6 +108,20 @@ inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    const CeedInt ind = indices[target_node + elem * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -113,6 +140,20 @@ inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt e
 // 2D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
@@ -155,6 +196,22 @@ inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -173,6 +230,21 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e
 // 3D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_node_z + target_comp * P_1D] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
@@ -250,6 +322,23 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 3d45310556..68fb8e9693 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -53,6 +53,19 @@ inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, cons
 // 1D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
@@ -94,6 +107,20 @@ inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt n
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    const CeedInt ind = indices[target_node + elem * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -112,6 +139,20 @@ inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt el
 // 2D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
@@ -153,6 +194,22 @@ inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt n
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -171,6 +228,21 @@ inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt el
 // 3D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_node_z + target_comp * P_1D] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
@@ -247,6 +319,23 @@ inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt n
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 9fa12e28e2..ec5b8252b3 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -566,7 +566,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
 
   @ref Developer
 **/
-static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) {
+int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) {
   bool is_composite, is_at_points;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -2051,6 +2051,11 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce
     CeedCall(CeedVectorSetValue(assembled, 0.0));
     CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request));
     return CEED_ERROR_SUCCESS;
+  } else if (is_composite) {
+    // Default to summing contributions of suboperators
+    CeedCall(CeedVectorSetValue(assembled, 0.0));
+    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
+    return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
     CeedOperator op_fallback;
@@ -2106,6 +2111,9 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
     // Backend version
     CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request));
     return CEED_ERROR_SUCCESS;
+  } else if (is_composite) {
+    // Default to summing contributions of suboperators
+    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
   } else {
     // Operator fallback
     CeedOperator op_fallback;
@@ -2117,11 +2125,7 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
     }
   }
   // Default interface implementation
-  if (is_composite) {
-    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
-  } else {
-    CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
-  }
+  CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
   return CEED_ERROR_SUCCESS;
 }
 

From 915834c9f1e582e3fdfc87db6b4fa4e010d293bb Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 17 Jun 2025 17:05:46 -0600
Subject: [PATCH 423/571] gen - full assembly at points for CUDA

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |  18 +-
 backends/cuda-gen/ceed-cuda-gen-operator.c    | 167 ++++++++++++++++++
 .../ceed/jit-source/cuda/cuda-gen-templates.h |  70 ++++++++
 interface/ceed-preconditioning.c              |  25 ++-
 4 files changed, 278 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 02bef7a16a..98b5fdd85a 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1995,7 +1995,19 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
 
     // ---- Restriction
     if (is_full) {
-      // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     } else {
       std::string         var_suffix = "_out_" + std::to_string(i);
       CeedInt             comp_stride;
@@ -2055,4 +2067,8 @@ extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOper
   return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, false, is_good_build);
 }
 
+extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, true, is_good_build);
+}
+
 //------------------------------------------------------------------------------
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 5b9acfdb86..f0728b091c 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -448,6 +448,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator o
 
     CeedCallBackend(
         CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+    CeedCallCuda(ceed, cudaDeviceSynchronize());
 
     // Restore input arrays
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -497,6 +498,171 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator o
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// AtPoints full assembly
+//------------------------------------------------------------------------------
+static int CeedSingleOperatorAssembleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
+  Ceed                   ceed;
+  CeedOperator_Cuda_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_full && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(
+        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(op, &is_build_good));
+    }
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                    is_run_good = true;
+    Ceed_Cuda              *cuda_data;
+    CeedInt                 num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode            eval_mode;
+    CeedScalar             *assembled_array;
+    CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Cuda_gen *qf_data;
+    CeedQFunction           qf;
+    CeedOperatorField      *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
+        }
+        if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
+        CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+    CeedScalar *assembled_offset_array = &assembled_array[offset];
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields,          &data->B,
+                      &data->G,          &data->W,      &data->points,  &assembled_offset_array};
+    int   max_threads_per_block, min_grid_size, grid;
+
+    CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+    int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+    CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
+
+    CeedCallBackend(
+        CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+    CeedCallCuda(ceed, cudaDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) data->use_assembly_fallback = true;
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
@@ -518,6 +684,7 @@ int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
   if (is_at_points) {
     CeedCallBackend(
         CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Cuda_gen));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index 3f8508e544..e26e2aaf5e 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -122,6 +122,25 @@ inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const C
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt in_comp    = in / P_1D;
+  const CeedInt in_node    = in % P_1D;
+  const CeedInt e_vec_size = P_1D * NUM_COMP;
+
+  if (data.t_id_x < P_1D) {
+    const CeedInt out_node = data.t_id_x;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -212,6 +231,30 @@ inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const C
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % elem_size) / P_1D;
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
+    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -339,6 +382,33 @@ inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const C
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
+  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index ec5b8252b3..e1fb0f7b54 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -2451,7 +2451,7 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
    @ref User
 **/
 int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
-  bool          is_composite;
+  bool          is_composite, has_linear_assemble_single;
   CeedInt       num_suboperators, offset = 0;
   CeedSize      single_entries = 0;
   CeedOperator *sub_operators;
@@ -2465,12 +2465,35 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
 
     CeedCall(CeedOperatorGetNumElements(op, &num_elem));
     if (num_elem == 0) return CEED_ERROR_SUCCESS;
+    has_linear_assemble_single = op->LinearAssembleSingle != NULL;
+  } else {
+    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    has_linear_assemble_single = true;
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      has_linear_assemble_single = has_linear_assemble_single && sub_operators[i]->LinearAssembleSingle != NULL;
+    }
   }
 
   if (op->LinearAssemble) {
     // Backend version
     CeedCall(op->LinearAssemble(op, values));
     return CEED_ERROR_SUCCESS;
+  } else if (has_linear_assemble_single) {
+    // Default to summing contributions of suboperators
+    CeedCall(CeedVectorSetValue(values, 0.0));
+    if (is_composite && num_suboperators > 0 && sub_operators[0]) {
+      CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+      for (CeedInt k = 0; k < num_suboperators; k++) {
+        CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values));
+        CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
+        offset += single_entries;
+      }
+    } else {
+      CeedCall(CeedSingleOperatorAssemble(op, offset, values));
+    }
+    return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
     CeedOperator op_fallback;

From 692716b783f81dd51daef618726177f4c6d7441d Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Tue, 17 Jun 2025 16:10:37 -0700
Subject: [PATCH 424/571] gen - full assembly at points for hip

---
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |  18 +-
 backends/hip-gen/ceed-hip-gen-operator.c      | 187 ++++++++++++++++++
 .../ceed/jit-source/hip/hip-gen-templates.h   |  70 +++++++
 3 files changed, 274 insertions(+), 1 deletion(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index a4e410c4a6..e7c1f4e8c7 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -2001,7 +2001,19 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
 
     // ---- Restriction
     if (is_full) {
-      // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     } else {
       std::string         var_suffix = "_out_" + std::to_string(i);
       CeedInt             comp_stride;
@@ -2067,4 +2079,8 @@ extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOpera
   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
 }
 
+extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
+}
+
 //------------------------------------------------------------------------------
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index e4622612c6..2bd1b4d6ae 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -427,6 +427,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op
       CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
                                                     sharedMem, &is_run_good, opargs));
     }
+    CeedCallHip(ceed, hipDeviceSynchronize());
 
     // Restore input arrays
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -476,6 +477,191 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// AtPoints full assembly
+//------------------------------------------------------------------------------
+static int CeedSingleOperatorAssembleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_full && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(
+        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(op, &is_build_good));
+    }
+    if (!is_build_good) {
+      CeedDebug(ceed, "Single Operator Assemble at Points compile failed, using fallback\n");
+      data->use_assembly_fallback = true;
+    }
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                   is_run_good = true;
+    Ceed_Hip              *Hip_data;
+    CeedInt                num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode           eval_mode;
+    CeedScalar            *assembled_array;
+    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Hip_gen *qf_data;
+    CeedQFunction          qf;
+    CeedOperatorField     *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &Hip_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+    CeedDebug(ceed, "Running single operator assemble for /gpu/hip/gen\n");
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
+        }
+        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
+        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+    CeedScalar *assembled_offset_array = &assembled_array[offset];
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields,          &data->B,
+                      &data->G,          &data->W,      &data->points,  &assembled_offset_array};
+
+    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
+    block_sizes[2] = 1;
+    if (data->dim == 1) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                    &is_run_good, opargs));
+    } else if (data->dim == 2) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                    &is_run_good, opargs));
+    } else if (data->dim == 3) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                    &is_run_good, opargs));
+    }
+    CeedCallHip(ceed, hipDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) {
+      CeedDebug(ceed, "Single Operator Assemble at Points run failed, using fallback\n");
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Create operator
 //------------------------------------------------------------------------------
@@ -496,6 +682,7 @@ int CeedOperatorCreate_Hip_gen(CeedOperator op) {
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   if (is_at_points) {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Hip_gen));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 68fb8e9693..e442cbc3c2 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -121,6 +121,25 @@ inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const Ce
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt in_comp    = in / P_1D;
+  const CeedInt in_node    = in % P_1D;
+  const CeedInt e_vec_size = P_1D * NUM_COMP;
+
+  if (data.t_id_x < P_1D) {
+    const CeedInt out_node = data.t_id_x;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -210,6 +229,30 @@ inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const Ce
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % elem_size) / P_1D;
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
+    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -336,6 +379,33 @@ inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const Ce
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
+  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------

From a34b87f3093c7ebf2340d51111ccb8fb4cdeaebd Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 18 Jun 2025 14:04:02 -0600
Subject: [PATCH 425/571] Update logic for CeedOperatorLinearAssemble to
 preference single operator assembly over fallback

---
 interface/ceed-preconditioning.c | 46 ++++++++++----------------------
 1 file changed, 14 insertions(+), 32 deletions(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index e1fb0f7b54..b8551c19b5 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -2451,7 +2451,7 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
    @ref User
 **/
 int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
-  bool          is_composite, has_linear_assemble_single;
+  bool          is_composite;
   CeedInt       num_suboperators, offset = 0;
   CeedSize      single_entries = 0;
   CeedOperator *sub_operators;
@@ -2465,35 +2465,27 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
 
     CeedCall(CeedOperatorGetNumElements(op, &num_elem));
     if (num_elem == 0) return CEED_ERROR_SUCCESS;
-    has_linear_assemble_single = op->LinearAssembleSingle != NULL;
-  } else {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    has_linear_assemble_single = true;
-    for (CeedInt i = 0; i < num_suboperators; i++) {
-      has_linear_assemble_single = has_linear_assemble_single && sub_operators[i]->LinearAssembleSingle != NULL;
-    }
   }
 
   if (op->LinearAssemble) {
     // Backend version
     CeedCall(op->LinearAssemble(op, values));
     return CEED_ERROR_SUCCESS;
-  } else if (has_linear_assemble_single) {
+  } else if (is_composite) {
     // Default to summing contributions of suboperators
     CeedCall(CeedVectorSetValue(values, 0.0));
-    if (is_composite && num_suboperators > 0 && sub_operators[0]) {
-      CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-      for (CeedInt k = 0; k < num_suboperators; k++) {
-        CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values));
-        CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
-        offset += single_entries;
-      }
-    } else {
-      CeedCall(CeedSingleOperatorAssemble(op, offset, values));
+    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    for (CeedInt k = 0; k < num_suboperators; k++) {
+      CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values));
+      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
+      offset += single_entries;
     }
     return CEED_ERROR_SUCCESS;
+  } else if (op->LinearAssembleSingle) {
+    CeedCall(CeedVectorSetValue(values, 0.0));
+    CeedCall(CeedSingleOperatorAssemble(op, offset, values));
+    return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
     CeedOperator op_fallback;
@@ -2505,19 +2497,9 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
     }
   }
 
-  // Default interface implementation
+  // Default to interface version if non-composite and no fallback
   CeedCall(CeedVectorSetValue(values, 0.0));
-  if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    for (CeedInt k = 0; k < num_suboperators; k++) {
-      CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values));
-      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
-      offset += single_entries;
-    }
-  } else {
-    CeedCall(CeedSingleOperatorAssemble(op, offset, values));
-  }
+  CeedCall(CeedSingleOperatorAssemble(op, offset, values));
   return CEED_ERROR_SUCCESS;
 }
 

From bce4db6f154acd10f9bd9018530c044d45753d01 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 20 Jun 2025 15:43:42 +0000
Subject: [PATCH 426/571] feat(sycl): Define CEED_RUNNING_JIT_PASS in JIT

---
 backends/sycl/ceed-sycl-compile.sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp
index 02caf67dbc..0a24ec1e2a 100644
--- a/backends/sycl/ceed-sycl-compile.sycl.cpp
+++ b/backends/sycl/ceed-sycl-compile.sycl.cpp
@@ -61,7 +61,7 @@ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_sourc
 // TODO: Add architecture flags, optimization flags
 //------------------------------------------------------------------------------
 static inline int CeedJitGetFlags_Sycl(std::vector<std::string> &flags) {
-  flags = {std::string("-cl-std=CL3.0"), std::string("-Dint32_t=int")};
+  flags = {std::string("-cl-std=CL3.0"), std::string("-Dint32_t=int"), std::string("-DCEED_RUNNING_JIT_PASS=1")};
   return CEED_ERROR_SUCCESS;
 }
 

From 54d1655432d457cbb92f9afed9995f62076cbbfe Mon Sep 17 00:00:00 2001
From: Hugh Carson <hughcars@amazon.com>
Date: Mon, 23 Jun 2025 15:45:49 -0400
Subject: [PATCH 427/571] Add missing early return

---
 interface/ceed-preconditioning.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index b8551c19b5..479a28dcb6 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -2114,6 +2114,7 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
   } else if (is_composite) {
     // Default to summing contributions of suboperators
     CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
+    return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
     CeedOperator op_fallback;

From bdcc27286a8034df1dd97bd8aefef85a0efa7b00 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 23 Jun 2025 15:01:07 -0600
Subject: [PATCH 428/571] debug - show JiT defines

---
 backends/cuda/ceed-cuda-compile.cpp | 10 ++++++++++
 backends/hip/ceed-hip-compile.cpp   | 10 ++++++++++
 2 files changed, 20 insertions(+)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 215c9c017d..367eb5389d 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -122,6 +122,16 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
   CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JIT SOURCE ----------\n");
+  if (CeedDebugFlag(ceed)) {
+    // LCOV_EXCL_START
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- JiT COMPILER OPTIONS ----------\n");
+    for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
+      CeedDebug(ceed, "Option %d: %s", i, opts[i]);
+    }
+    CeedDebug(ceed, "");
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JiT COMPILER OPTIONS ----------\n");
+    // LCOV_EXCL_STOP
+  }
   nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
   for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 557f6e584e..650be9efaa 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -124,6 +124,16 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e
   CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
   CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JIT SOURCE ----------\n");
+  if (CeedDebugFlag(ceed)) {
+    // LCOV_EXCL_START
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- JiT COMPILER OPTIONS ----------\n");
+    for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
+      CeedDebug(ceed, "Option %d: %s", i, opts[i]);
+    }
+    CeedDebug(ceed, "");
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JiT COMPILER OPTIONS ----------\n");
+    // LCOV_EXCL_STOP
+  }
   hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
   for (CeedInt i = 0; i < num_jit_source_dirs; i++) {

From c49dc7a77224ad43df2d4313d5938b1b09f285d1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 26 Jun 2025 15:50:11 -0600
Subject: [PATCH 429/571] jit - report launch errors when using try-catch

---
 backends/cuda/ceed-cuda-compile.cpp | 21 +++++++++++++++------
 backends/hip/ceed-hip-compile.cpp   | 20 ++++++++++++++++++--
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 367eb5389d..370e165fc5 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -152,12 +152,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     if (throw_error) {
       return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
     } else {
+      // LCOV_EXCL_START
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
       CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
       CeedCallBackend(CeedFree(&log));
       CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
       return CEED_ERROR_SUCCESS;
+      // LCOV_EXCL_STOP
     }
   }
 
@@ -250,17 +252,24 @@ static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstrea
   CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
 
   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
-    *is_good_run = false;
-    if (throw_error) {
-      int max_threads_per_block, shared_size_bytes, num_regs;
+    int max_threads_per_block, shared_size_bytes, num_regs;
 
-      cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
-      cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
-      cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
+    cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
+    cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
+    cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
+    if (throw_error) {
       return CeedError(ceed, CEED_ERROR_BACKEND,
                        "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
                        max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+    } else {
+      // LCOV_EXCL_START
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
+                max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      // LCOV_EXCL_STOP
     }
+    *is_good_run = false;
   } else CeedChk_Cu(ceed, result);
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 650be9efaa..bc44c5e0f7 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -154,12 +154,14 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e
     if (throw_error) {
       return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log);
     } else {
+      // LCOV_EXCL_START
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
       CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", hiprtcGetErrorString(result), log);
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
       CeedCallBackend(CeedFree(&log));
       CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog));
       return CEED_ERROR_SUCCESS;
+      // LCOV_EXCL_STOP
     }
   }
 
@@ -229,8 +231,22 @@ static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipSt
                                           bool *is_good_run, void **args) {
   hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
 
-  *is_good_run = result == hipSuccess;
-  if (throw_error) CeedCallHip(ceed, result);
+  if (result == hipSuccess) {
+    *is_good_run = true;
+  } else {
+    if (throw_error) {
+      CeedCallHip(ceed, result);
+    } else {
+      // LCOV_EXCL_START
+      const char *message = hipGetErrorName(result);
+
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "%s\n", message);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      // LCOV_EXCL_STOP
+    }
+    *is_good_run = false;
+  }
   return CEED_ERROR_SUCCESS;
 }
 

From d3d5610df460248361ab17f2fa259b4661019597 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 26 Jun 2025 12:41:59 -0600
Subject: [PATCH 430/571] op - add GetName helper

---
 include/ceed/backend.h           |  1 +
 include/ceed/ceed.h              |  1 +
 interface/ceed-operator.c        | 32 +++++++++++++++++++++++++++++---
 interface/ceed-qfunction.c       | 28 ++++++++++++++++++++++++----
 python/tests/output/test_504.out |  4 ++--
 5 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 3a5171a8b5..a86e317e1e 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -366,6 +366,7 @@ CEED_EXTERN int CeedQFunctionSetFortranStatus(CeedQFunction qf, bool status);
 CEED_EXTERN int CeedQFunctionGetVectorLength(CeedQFunction qf, CeedInt *vec_length);
 CEED_EXTERN int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input_fields, CeedInt *num_output_fields);
 CEED_EXTERN int CeedQFunctionGetKernelName(CeedQFunction qf, const char **kernel_name);
+CEED_EXTERN int CeedQFunctionGetName(CeedQFunction qf, const char **name);
 CEED_EXTERN int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path);
 CEED_EXTERN int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer);
 CEED_EXTERN int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index b56a719754..af510065eb 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -448,6 +448,7 @@ CEED_EXTERN int  CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVe
                                                     CeedOperator *op_prolong, CeedOperator *op_restrict);
 CEED_EXTERN int  CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorSetName(CeedOperator op, const char *name);
+CEED_EXTERN int  CeedOperatorGetName(CeedOperator op, const char **name);
 CEED_EXTERN int  CeedOperatorView(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorViewTerse(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorGetCeed(CeedOperator op, Ceed *ceed);
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index be83326781..184e41799f 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1553,6 +1553,29 @@ int CeedOperatorSetName(CeedOperator op, const char *name) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get name of `CeedOperator`
+
+  @param[in]     op   `CeedOperator`
+  @param[in,out] name Address of variable to hold currently set name
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorGetName(CeedOperator op, const char **name) {
+  if (op->name) {
+    *name = op->name;
+  } else if (!op->is_composite) {
+    CeedQFunction qf;
+
+    CeedCall(CeedOperatorGetQFunction(op, &qf));
+    if (qf) CeedCall(CeedQFunctionGetName(qf, name));
+    CeedCall(CeedQFunctionDestroy(&qf));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Core logic for viewing a `CeedOperator`
 
@@ -1565,8 +1588,11 @@ int CeedOperatorSetName(CeedOperator op, const char *name) {
   @ref Developer
 **/
 static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
-  bool has_name = op->name, is_composite, is_at_points;
+  bool        has_name, is_composite, is_at_points;
+  const char *name = NULL;
 
+  CeedCall(CeedOperatorGetName(op, &name));
+  has_name = name ? strlen(name) : false;
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   if (is_composite) {
@@ -1575,7 +1601,7 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
 
     CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
     CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : "");
+    fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? name : "");
 
     for (CeedInt i = 0; i < num_suboperators; i++) {
       has_name = sub_operators[i]->name;
@@ -1584,7 +1610,7 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
       if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], 1, stream));
     }
   } else {
-    fprintf(stream, "CeedOperator%s%s%s\n", is_at_points ? " AtPoints" : "", has_name ? " - " : "", has_name ? op->name : "");
+    fprintf(stream, "CeedOperator%s%s%s\n", is_at_points ? " AtPoints" : "", has_name ? " - " : "", has_name ? name : "");
     if (is_full) CeedCall(CeedOperatorSingleView(op, 0, stream));
   }
   return CEED_ERROR_SUCCESS;
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 22e90c8b54..4f9f5e55cd 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -196,11 +196,31 @@ int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input, CeedInt *num_o
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the name of the `CeedQFunction`.
+    Use the `name` if created via @ref `CeedQFunctionCreateInteriorByName`, otherwise return the kernel name via @ref `CeedQFunctionGetKernelName`.
+
+  @param[in]  qf          `CeedQFunction`
+  @param[out] kernel_name Variable to store `CeedQFunction` name
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedQFunctionGetName(CeedQFunction qf, const char **name) {
+  if (qf->is_gallery) {
+    *name = qf->gallery_name;
+  } else {
+    CeedCall(CeedQFunctionGetKernelName(qf, name));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the name of the user function for a `CeedQFunction`
 
   @param[in]  qf          `CeedQFunction`
-  @param[out] kernel_name Variable to store source path string
+  @param[out] kernel_name Variable to store string holding kernel name
 
   @return An error code: 0 - success, otherwise - failure
 
@@ -1001,10 +1021,10 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) {
   @ref User
 **/
 int CeedQFunctionView(CeedQFunction qf, FILE *stream) {
-  const char *kernel_name;
+  const char *name;
 
-  CeedCall(CeedQFunctionGetKernelName(qf, &kernel_name));
-  fprintf(stream, "%sCeedQFunction - %s\n", qf->is_gallery ? "Gallery " : "User ", qf->is_gallery ? qf->gallery_name : kernel_name);
+  CeedCall(CeedQFunctionGetName(qf, &name));
+  fprintf(stream, "%sCeedQFunction - %s\n", qf->is_gallery ? "Gallery " : "User ", name);
 
   fprintf(stream, "  %" CeedInt_FMT " input field%s:\n", qf->num_input_fields, qf->num_input_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
diff --git a/python/tests/output/test_504.out b/python/tests/output/test_504.out
index 3e1d828503..3b8acf130a 100644
--- a/python/tests/output/test_504.out
+++ b/python/tests/output/test_504.out
@@ -1,4 +1,4 @@
-CeedOperator
+CeedOperator - setup_mass
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
@@ -20,7 +20,7 @@ CeedOperator
       No basis
       Active vector
 
-CeedOperator
+CeedOperator - apply_mass
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:

From 120566fcca1d0a2e0b255629f5180057633cc46e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 26 Jun 2025 12:44:12 -0600
Subject: [PATCH 431/571] pc - clearer debug message

---
 interface/ceed-preconditioning.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 479a28dcb6..c62cd66826 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1838,14 +1838,16 @@ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) {
     CeedCall(CeedIsDebug(ceed, &is_debug));
     if (is_debug) {
       Ceed        ceed_fallback;
-      const char *resource, *resource_fallback;
+      const char *resource, *resource_fallback, *op_name;
 
       CeedCall(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
       CeedCall(CeedGetResource(ceed, &resource));
       CeedCall(CeedGetResource(ceed_fallback, &resource_fallback));
+      CeedCall(CeedOperatorGetName(op, &op_name));
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n");
-      CeedDebug(ceed, "Falling back from %s operator at address %p to %s operator at address %p\n", resource, op, resource_fallback, op->op_fallback);
+      CeedDebug(ceed, "CeedOperator \"%s\": falling back from operator at address %p with backend %s to operator at address %p with backend %s\n",
+                op_name, resource, op, resource_fallback, op->op_fallback);
       CeedCall(CeedDestroy(&ceed_fallback));
     }
     CeedCall(CeedDestroy(&ceed));

From 3e2e790d55559d3802e94c4d2ca1c8066babe5db Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 27 Jun 2025 17:29:08 -0600
Subject: [PATCH 432/571] gen - add missing syncthreads in flattened kernels

---
 .../cuda-shared-basis-tensor-flattened-templates.h     | 10 ++++++++++
 .../hip/hip-shared-basis-tensor-flattened-templates.h  | 10 ++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
index d004c341a6..4f76825d50 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -138,6 +138,7 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
@@ -156,6 +157,7 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, C
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
@@ -175,6 +177,7 @@ inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
@@ -195,6 +198,7 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, Cee
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
@@ -404,6 +408,7 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
@@ -423,6 +428,7 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
@@ -447,6 +453,7 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
@@ -472,6 +479,7 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee
     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
@@ -493,6 +501,7 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce
     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
@@ -515,6 +524,7 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
index 94e17c1dc0..25c5078718 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
@@ -138,6 +138,7 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
@@ -156,6 +157,7 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, Ce
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
@@ -175,6 +177,7 @@ inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *_
     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
@@ -195,6 +198,7 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, Ceed
     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
@@ -404,6 +408,7 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
@@ -423,6 +428,7 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, Ce
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
@@ -447,6 +453,7 @@ inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *_
     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
@@ -472,6 +479,7 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, Ceed
     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
@@ -493,6 +501,7 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, Cee
     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
@@ -515,6 +524,7 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &
     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
   }
+  __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 

From 025ec10c5198b74b3e040f4a57328b8535504c96 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 27 Jun 2025 17:29:29 -0600
Subject: [PATCH 433/571] minor - consistency with &=

---
 interface/ceed-operator.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 184e41799f..d0545ba7e6 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -586,7 +586,7 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) {
     CeedCall(CeedOperatorFieldGetBasis(input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       CeedCall(CeedBasisIsTensor(basis, &is_tensor));
-      *has_tensor_bases &= is_tensor;
+      *has_tensor_bases = *has_tensor_bases & is_tensor;
     }
     CeedCall(CeedBasisDestroy(&basis));
   }
@@ -597,7 +597,7 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) {
     CeedCall(CeedOperatorFieldGetBasis(output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       CeedCall(CeedBasisIsTensor(basis, &is_tensor));
-      *has_tensor_bases &= is_tensor;
+      *has_tensor_bases = *has_tensor_bases & is_tensor;
     }
     CeedCall(CeedBasisDestroy(&basis));
   }

From 2b0143a5204bafdd8e64d42b75210eecaef0ebbd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 30 Jun 2025 08:51:49 -0600
Subject: [PATCH 434/571] make - use root as safe dir for GIT_DESCRIBE

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 822e21f83c..0f3c74fd5a 100644
--- a/Makefile
+++ b/Makefile
@@ -854,7 +854,7 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 	    -e "s:%opt%:$(OPT):" \
 	    -e "s:%libs_private%:$(pkgconfig-libs-private):" $< > $@
 
-GIT_DESCRIBE = $(shell git describe --always --dirty 2>/dev/null || printf "unknown\n")
+GIT_DESCRIBE = $(shell git -c safe.directory=$PWD describe --always --dirty 2>/dev/null || printf "unknown\n")
 
 $(OBJDIR)/interface/ceed-config.o: Makefile
 $(OBJDIR)/interface/ceed-config.o: CONFIGFLAGS += -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\""

From 0816752e297ae5dd4074175fee440caf6c69c9f1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 2 Jul 2025 14:20:56 -0600
Subject: [PATCH 435/571] cuda - QFunction assembly for gen

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 570 ++++++++++++++++++
 .../cuda-gen/ceed-cuda-gen-operator-build.h   |   1 +
 backends/cuda-gen/ceed-cuda-gen-operator.c    | 199 ++++++
 backends/cuda-gen/ceed-cuda-gen.h             |   4 +-
 include/ceed/backend.h                        |   2 +
 .../ceed/jit-source/cuda/cuda-gen-templates.h |  47 ++
 interface/ceed-preconditioning.c              | 157 +++--
 tests/t531-operator.c                         |   2 +-
 8 files changed, 925 insertions(+), 57 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 98b5fdd85a..481c358466 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -2072,3 +2072,573 @@ extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator
 }
 
 //------------------------------------------------------------------------------
+// Build QFunction assembly operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOperator op, bool *is_good_build) {
+  bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                    ceed;
+  CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
+  CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Cuda_gen *qf_data;
+  CeedQFunction           qf;
+  CeedOperatorField      *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda_gen  *data;
+  std::ostringstream      code;
+  Tab                     tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
+
+  // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  {
+    bool has_shared_bases = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+  }
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Add atomicAdd function for old NVidia architectures
+  {
+    Ceed_Cuda            *ceed_data;
+    struct cudaDeviceProp prop;
+
+    CeedCallBackend(CeedGetData(ceed, &ceed_data));
+    CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
+    if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
+      code << tab << "// AtomicAdd fallback source\n";
+      code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
+    }
+  }
+
+  // Load basis source files
+  if (!is_all_nontensor) {
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor) {
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
+  code << "// CodeGen operator source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  operator_name = "CeedKernelCudaGenQFunctionAssembly_" + qfunction_name;
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
+    code << tab << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << tab << "const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Cuda data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
+  CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      CeedEvalMode eval_mode;
+      CeedInt      field_size;
+
+      active_fields_in[num_active_in] = f;
+      num_active_in++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
+      if (eval_mode == CEED_EVAL_GRAD) {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
+             << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      } else {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      }
+      code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                            is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+  code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      const char *field_name;
+      CeedInt     field_size;
+
+      active_fields_out[num_active_out] = i;
+      num_active_out++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+      qf_assembly_size_out += field_size;
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
+    }
+  }
+  code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
+  for (CeedInt i = 0; i < num_active_out; i++) {
+    code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
+
+  // -- Loop over active field
+  code << "\n" << tab << "CeedInt input_offset = 0;\n";
+  code << tab << "// Loop over active QFunction input fields\n";
+  code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
+  code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
+  tab.push();
+
+  // -- Loop over size of active field
+  code << "\n" << tab << "// Loop over current active input field size\n";
+  code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
+  code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
+  tab.push();
+
+  // -- Set current active point and component to 1
+  code << tab << "// Set current active point and component to 1.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 1.0;\n";
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  CeedScalar offset = 0;
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Restriction
+    CeedInt field_size;
+
+    code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
+         << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+    offset += field_size;
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 0.0;\n";
+  }
+
+  // -- End of loop over size of active field
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "input_offset += field_size_in;\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  // Compile
+  {
+    bool          is_compile_good = false;
+    const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
+
+    data->thread_1d = T_1d;
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
index 7c116e1bfa..3568e57ee3 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
@@ -9,3 +9,4 @@
 CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build);
 CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build);
 CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index f0728b091c..5342b6a4a8 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -30,6 +30,7 @@ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
   if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module));
   if (impl->module_assemble_full) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_full));
   if (impl->module_assemble_diagonal) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_diagonal));
+  if (impl->module_assemble_qfunction) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_qfunction));
   if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
   CeedCallBackend(CeedDestroy(&ceed));
@@ -335,6 +336,199 @@ static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector in
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// QFunction assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                            CeedRequest *request) {
+  Ceed                   ceed;
+  CeedOperator_Cuda_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_qfunction && !data->use_assembly_fallback) {
+    bool is_build_good = false;
+
+    CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
+    if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(op, &is_build_good));
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                    is_run_good = true;
+    Ceed_Cuda              *cuda_data;
+    CeedInt                 num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode            eval_mode;
+    CeedScalar             *assembled_array;
+    CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Cuda_gen *qf_data;
+    CeedQFunction           qf;
+    CeedOperatorField      *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Build objects if needed
+    if (build_objects) {
+      CeedInt qf_size_in = 0, qf_size_out = 0, Q;
+
+      // Count number of active input fields
+      {
+        for (CeedInt i = 0; i < num_input_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get input vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+          // Check if active input
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+            qf_size_in += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
+        }
+        CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+
+      // Count number of active output fields
+      {
+        for (CeedInt i = 0; i < num_output_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get output vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+          // Check if active output
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+            qf_size_out += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
+        }
+        CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+      CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+
+      // Actually build objects now
+      const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+      CeedInt        strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
+
+      // Create output restriction
+      CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                       (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                       rstr));
+      // Create assembled vector
+      CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
+    }
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble QFunction
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+    bool  is_tensor = false;
+    int   max_threads_per_block, min_grid_size, grid;
+
+    CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+    CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+    int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
+
+    if (is_tensor) {
+      CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
+                                         cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+    } else {
+      CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));
+
+      grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+      block[2] = elems_per_block;
+    }
+    CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
+
+    CeedCallBackend(
+        CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_qfunction, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+    CeedCallCuda(ceed, cudaDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) {
+      data->use_assembly_fallback = true;
+      if (build_objects) {
+        CeedCallBackend(CeedVectorDestroy(assembled));
+        CeedCallBackend(CeedElemRestrictionDestroy(rstr));
+      }
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorLinearAssembleQFunction_Cuda_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, true, assembled, rstr, request);
+}
+
+static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, false, &assembled, &rstr, request);
+}
+
 //------------------------------------------------------------------------------
 // AtPoints diagonal assembly
 //------------------------------------------------------------------------------
@@ -686,6 +880,11 @@ int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
         CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen));
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Cuda_gen));
   }
+  if (!is_at_points) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda_gen));
+    CeedCallBackend(
+        CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
   CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index ef80881a77..4f64d3a4f8 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -17,8 +17,8 @@ typedef struct {
   CeedInt        Q, Q_1d;
   CeedInt        max_P_1d;
   CeedInt        thread_1d;
-  CUmodule       module, module_assemble_full, module_assemble_diagonal;
-  CUfunction     op, assemble_full, assemble_diagonal;
+  CUmodule       module, module_assemble_full, module_assemble_diagonal, module_assemble_qfunction;
+  CUfunction     op, assemble_full, assemble_diagonal, assemble_qfunction;
   FieldsInt_Cuda indices;
   Fields_Cuda    fields;
   Fields_Cuda    B;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index a86e317e1e..e6b608e571 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -461,6 +461,8 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op);
 CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback);
 CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent);
 CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent);
+CEED_EXTERN int CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                                         CeedRequest *request);
 CEED_INTERN int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values);
 CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op);
 
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index e26e2aaf5e..f4dccf54ea 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -141,6 +141,21 @@ inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D) {
+    const CeedInt ind = data.t_id_x + elem * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -255,6 +270,21 @@ inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -409,6 +439,23 @@ inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    for (CeedInt z = 0; z < Q_1D; z++) {
+      const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
+
+      for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+        d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
+      }
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index c62cd66826..fe583ab9e5 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -553,6 +553,108 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Core logic to assemble `CeedQFunction` and store result internally.
+
+  Return copied references of stored data to the caller.
+  Caller is responsible for ownership and destruction of the copied references.
+  See also @ref CeedOperatorLinearAssembleQFunction().
+
+  Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers.
+        These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object.
+
+  @param[in]  op         `CeedOperator` to assemble `CeedQFunction`
+  @param[in]  use_parent Boolean flag to check for fallback parent implementation
+  @param[out] assembled  `CeedVector` to store assembled `CeedQFunction` at quadrature points
+  @param[out] rstr       `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction`
+  @param[in]  request    Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op, bool use_parent, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                                 CeedRequest *request) {
+  int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *) = NULL;
+  CeedOperator op_assemble                                                                           = NULL;
+  CeedOperator op_fallback_parent                                                                    = NULL;
+
+  CeedCall(CeedOperatorCheckReady(op));
+
+  // Determine if fallback parent or operator has implementation
+  CeedCall(CeedOperatorGetFallbackParent(op, &op_fallback_parent));
+  if (op_fallback_parent && use_parent && op_fallback_parent->LinearAssembleQFunctionUpdate) {
+    // -- Backend version for op fallback parent is faster, if it exists
+    LinearAssembleQFunctionUpdate = op_fallback_parent->LinearAssembleQFunctionUpdate;
+    op_assemble                   = op_fallback_parent;
+  } else if (op->LinearAssembleQFunctionUpdate) {
+    // -- Backend version for op
+    LinearAssembleQFunctionUpdate = op->LinearAssembleQFunctionUpdate;
+    op_assemble                   = op;
+  }
+
+  // Assemble QFunction
+  if (LinearAssembleQFunctionUpdate) {
+    // Backend or fallback parent version
+    CeedQFunctionAssemblyData data;
+    bool                      data_is_setup;
+    CeedVector                assembled_vec  = NULL;
+    CeedElemRestriction       assembled_rstr = NULL;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+    CeedCall(CeedQFunctionAssemblyDataIsSetup(data, &data_is_setup));
+    if (data_is_setup) {
+      bool update_needed;
+
+      CeedCall(CeedQFunctionAssemblyDataGetObjects(data, &assembled_vec, &assembled_rstr));
+      CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(data, &update_needed));
+      if (update_needed) CeedCall(LinearAssembleQFunctionUpdate(op_assemble, assembled_vec, assembled_rstr, request));
+    } else {
+      CeedCall(CeedOperatorLinearAssembleQFunction(op_assemble, &assembled_vec, &assembled_rstr, request));
+      CeedCall(CeedQFunctionAssemblyDataSetObjects(data, assembled_vec, assembled_rstr));
+    }
+    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, false));
+
+    // Copy reference from internally held copy
+    CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled));
+    CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr));
+    CeedCall(CeedVectorDestroy(&assembled_vec));
+    CeedCall(CeedElemRestrictionDestroy(&assembled_rstr));
+  } else {
+    // Operator fallback
+    CeedOperator op_fallback;
+
+    CeedCall(CeedOperatorGetFallback(op, &op_fallback));
+    if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
+    else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate");
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Assemble `CeedQFunction` and store result internally, but do not use fallback parent.
+
+  Return copied references of stored data to the caller.
+  Caller is responsible for ownership and destruction of the copied references.
+  See also @ref CeedOperatorLinearAssembleQFunction().
+
+  Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers.
+        These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object.
+
+  @param[in]  op        `CeedOperator` to assemble `CeedQFunction`
+  @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points
+  @param[out] rstr      `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction`
+  @param[in]  request   Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+int CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                             CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, false, assembled, rstr, request);
+}
+
 /**
   @brief Assemble nonzero entries for non-composite `CeedOperator`.
 
@@ -1953,60 +2055,7 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled,
   @ref User
 **/
 int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
-  int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *) = NULL;
-  CeedOperator op_assemble                                                                           = NULL;
-  CeedOperator op_fallback_parent                                                                    = NULL;
-
-  CeedCall(CeedOperatorCheckReady(op));
-
-  // Determine if fallback parent or operator has implementation
-  CeedCall(CeedOperatorGetFallbackParent(op, &op_fallback_parent));
-  if (op_fallback_parent && op_fallback_parent->LinearAssembleQFunctionUpdate) {
-    // -- Backend version for op fallback parent is faster, if it exists
-    LinearAssembleQFunctionUpdate = op_fallback_parent->LinearAssembleQFunctionUpdate;
-    op_assemble                   = op_fallback_parent;
-  } else if (op->LinearAssembleQFunctionUpdate) {
-    // -- Backend version for op
-    LinearAssembleQFunctionUpdate = op->LinearAssembleQFunctionUpdate;
-    op_assemble                   = op;
-  }
-
-  // Assemble QFunction
-  if (LinearAssembleQFunctionUpdate) {
-    // Backend or fallback parent version
-    CeedQFunctionAssemblyData data;
-    bool                      data_is_setup;
-    CeedVector                assembled_vec  = NULL;
-    CeedElemRestriction       assembled_rstr = NULL;
-
-    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
-    CeedCall(CeedQFunctionAssemblyDataIsSetup(data, &data_is_setup));
-    if (data_is_setup) {
-      bool update_needed;
-
-      CeedCall(CeedQFunctionAssemblyDataGetObjects(data, &assembled_vec, &assembled_rstr));
-      CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(data, &update_needed));
-      if (update_needed) CeedCall(LinearAssembleQFunctionUpdate(op_assemble, assembled_vec, assembled_rstr, request));
-    } else {
-      CeedCall(CeedOperatorLinearAssembleQFunction(op_assemble, &assembled_vec, &assembled_rstr, request));
-      CeedCall(CeedQFunctionAssemblyDataSetObjects(data, assembled_vec, assembled_rstr));
-    }
-    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, false));
-
-    // Copy reference from internally held copy
-    CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled));
-    CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr));
-    CeedCall(CeedVectorDestroy(&assembled_vec));
-    CeedCall(CeedElemRestrictionDestroy(&assembled_rstr));
-  } else {
-    // Operator fallback
-    CeedOperator op_fallback;
-
-    CeedCall(CeedOperatorGetFallback(op, &op_fallback));
-    if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
-    else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate");
-  }
-  return CEED_ERROR_SUCCESS;
+  return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, true, assembled, rstr, request);
 }
 
 /**
diff --git a/tests/t531-operator.c b/tests/t531-operator.c
index 767f6a769e..b0e09caf8b 100644
--- a/tests/t531-operator.c
+++ b/tests/t531-operator.c
@@ -133,7 +133,7 @@ int main(int argc, char **argv) {
     for (CeedInt i = 0; i < num_dofs; i++) {
       if (fabs(v_array[i] - v_assembled_array[i]) > 100. * CEED_EPSILON) {
         // LCOV_EXCL_START
-        printf("Error: Linearized operator computed v[i] = %f != %f\n", v_assembled_array[i], v_array[i]);
+        printf("Error: Linearized operator computed v[%d] = %f != %f\n", i, v_assembled_array[i], v_array[i]);
         // LCOV_EXCL_STOP
       }
     }

From 5daefc96c3d6c1b0bbb656215f0640792d88e993 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 8 Jul 2025 10:09:37 -0600
Subject: [PATCH 436/571] hip - QFunction assembly for gen

---
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 562 ++++++++++++++++++
 .../hip-gen/ceed-hip-gen-operator-build.h     |   1 +
 backends/hip-gen/ceed-hip-gen-operator.c      | 209 +++++++
 backends/hip-gen/ceed-hip-gen.h               |   4 +-
 .../ceed/jit-source/hip/hip-gen-templates.h   |  47 ++
 5 files changed, 821 insertions(+), 2 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index e7c1f4e8c7..674f94b428 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -2082,5 +2082,567 @@ extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOpera
 extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
 }
+//------------------------------------------------------------------------------
+// Build QFunction assembly operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
+  bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                   ceed;
+  CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
+  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Hip_gen *qf_data;
+  CeedQFunction          qf;
+  CeedOperatorField     *op_input_fields, *op_output_fields;
+  CeedOperator_Hip_gen  *data;
+  std::ostringstream     code;
+  Tab                    tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
+
+  // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  {
+    bool has_shared_bases = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+  }
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Load basis source files
+  if (!is_all_nontensor) {
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor) {
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
+  code << "// CodeGen operator source\n";
+  code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
+    code << tab << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << tab << "const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Hip data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
+  CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      CeedEvalMode eval_mode;
+      CeedInt      field_size;
+
+      active_fields_in[num_active_in] = f;
+      num_active_in++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
+      if (eval_mode == CEED_EVAL_GRAD) {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
+             << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      } else {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      }
+      code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                           is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+  code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      const char *field_name;
+      CeedInt     field_size;
+
+      active_fields_out[num_active_out] = i;
+      num_active_out++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+      qf_assembly_size_out += field_size;
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
+    }
+  }
+  code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
+  for (CeedInt i = 0; i < num_active_out; i++) {
+    code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
+
+  // -- Loop over active field
+  code << "\n" << tab << "CeedInt input_offset = 0;\n";
+  code << tab << "// Loop over active QFunction input fields\n";
+  code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
+  code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
+  tab.push();
+
+  // -- Loop over size of active field
+  code << "\n" << tab << "// Loop over current active input field size\n";
+  code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
+  code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
+  tab.push();
+
+  // -- Set current active point and component to 1
+  code << tab << "// Set current active point and component to 1.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 1.0;\n";
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  CeedScalar offset = 0;
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Restriction
+    CeedInt field_size;
+
+    code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
+         << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+    offset += field_size;
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 0.0;\n";
+  }
+
+  // -- End of loop over size of active field
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "input_offset += field_size_in;\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  CeedInt block_sizes[3] = {0, 0, 0};
+  CeedInt num_elem;
+
+  // Compile
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
+  block_sizes[2] = 1;
+  {
+    bool is_compile_good = false;
+
+    data->thread_1d = block_sizes[0];
+    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
+                                       "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
 
 //------------------------------------------------------------------------------
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h
index ac74461ecb..60c2fcd479 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.h
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.h
@@ -10,3 +10,4 @@ CEED_INTERN int BlockGridCalculate_Hip_gen(CeedInt dim, CeedInt num_elem, CeedIn
 CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build);
 CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build);
 CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 2bd1b4d6ae..d513fd488d 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -39,6 +39,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
   if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
   if (impl->module_assemble_full) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_full));
   if (impl->module_assemble_diagonal) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_diagonal));
+  if (impl->module_assemble_qfunction) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_qfunction));
   if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
   CeedCallBackend(CeedDestroy(&ceed));
@@ -301,6 +302,210 @@ static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector inp
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// QFunction assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                           CeedRequest *request) {
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_qfunction && !data->use_assembly_fallback) {
+    bool is_build_good = false;
+
+    CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
+    if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(op, &is_build_good));
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                   is_run_good = true;
+    Ceed_Hip              *hip_data;
+    CeedInt                num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode           eval_mode;
+    CeedScalar            *assembled_array;
+    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Hip_gen *qf_data;
+    CeedQFunction          qf;
+    CeedOperatorField     *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &hip_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Build objects if needed
+    if (build_objects) {
+      CeedInt qf_size_in = 0, qf_size_out = 0, Q;
+
+      // Count number of active input fields
+      {
+        for (CeedInt i = 0; i < num_input_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get input vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+          // Check if active input
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+            qf_size_in += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
+        }
+        CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+
+      // Count number of active output fields
+      {
+        for (CeedInt i = 0; i < num_output_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get output vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+          // Check if active output
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+            qf_size_out += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
+        }
+        CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+      CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+
+      // Actually build objects now
+      const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+      CeedInt        strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
+
+      // Create output restriction
+      CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                       (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                       rstr));
+      // Create assembled vector
+      CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
+    }
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble QFunction
+    bool  is_tensor = false;
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+
+    CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+    CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
+
+    if (is_tensor) {
+      CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
+    } else {
+      CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;
+
+      elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
+      block_sizes[2]  = elems_per_block;
+    }
+    if (data->dim == 1 || !is_tensor) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 2) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 3) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    }
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) {
+      data->use_assembly_fallback = true;
+      if (build_objects) {
+        CeedCallBackend(CeedVectorDestroy(assembled));
+        CeedCallBackend(CeedElemRestrictionDestroy(rstr));
+      }
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorLinearAssembleQFunction_Hip_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, true, assembled, rstr, request);
+}
+
+static int CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, false, &assembled, &rstr, request);
+}
+
 //------------------------------------------------------------------------------
 // AtPoints diagonal assembly
 //------------------------------------------------------------------------------
@@ -684,6 +889,10 @@ int CeedOperatorCreate_Hip_gen(CeedOperator op) {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Hip_gen));
   }
+  if (!is_at_points) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
   CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index cd7dfe6773..06bbe5a1ad 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -18,8 +18,8 @@ typedef struct {
   CeedInt       max_P_1d;
   CeedInt       thread_1d;
   hipStream_t   streams[CEED_COMPOSITE_MAX];
-  hipModule_t   module, module_assemble_full, module_assemble_diagonal;
-  hipFunction_t op, assemble_full, assemble_diagonal;
+  hipModule_t   module, module_assemble_full, module_assemble_diagonal, module_assemble_qfunction;
+  hipFunction_t op, assemble_full, assemble_diagonal, assemble_qfunction;
   FieldsInt_Hip indices;
   Fields_Hip    fields;
   Fields_Hip    B;
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index e442cbc3c2..03f9204a99 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -140,6 +140,21 @@ inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D) {
+    const CeedInt ind = data.t_id_x + elem * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -253,6 +268,21 @@ inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
@@ -406,6 +436,23 @@ inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const
   }
 }
 
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    for (CeedInt z = 0; z < Q_1D; z++) {
+      const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
+
+      for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+        d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
+      }
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------

From af34f196daa22ba0e05ead6c75a46a2ccef75ae4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 8 Jul 2025 10:34:50 -0600
Subject: [PATCH 437/571] tidy - fix warning, minor consistency

---
 backends/hip-gen/ceed-hip-gen-operator.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index d513fd488d..9a04027f24 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -253,10 +253,10 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 }
 
 static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                  is_run_good[CEED_COMPOSITE_MAX] = {true};
+  bool                  is_run_good[CEED_COMPOSITE_MAX] = {false};
   CeedInt               num_suboperators;
-  const CeedScalar     *input_arr = NULL;
-  CeedScalar           *output_arr;
+  const CeedScalar     *input_arr  = NULL;
+  CeedScalar           *output_arr = NULL;
   Ceed                  ceed;
   CeedOperator_Hip_gen *impl;
   CeedOperator         *sub_operators;

From c8758636b5c68890a733ece6c4c5be0c3dca218d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 9 Jul 2025 09:18:51 -0600
Subject: [PATCH 438/571] minor - fix debug print args

---
 interface/ceed-preconditioning.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index fe583ab9e5..ecc224b7e2 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1949,7 +1949,7 @@ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) {
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n");
       CeedDebug(ceed, "CeedOperator \"%s\": falling back from operator at address %p with backend %s to operator at address %p with backend %s\n",
-                op_name, resource, op, resource_fallback, op->op_fallback);
+                op_name, op, resource, op->op_fallback, resource_fallback);
       CeedCall(CeedDestroy(&ceed_fallback));
     }
     CeedCall(CeedDestroy(&ceed));

From ca38d01d95e1d63e8425c25ab66131b07bd24aa2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 9 Jul 2025 10:13:33 -0600
Subject: [PATCH 439/571] debug - clearer debug messages for fallback control
 flow

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 10 +++++-----
 backends/hip-gen/ceed-hip-gen-operator.c   | 10 +++++-----
 interface/ceed-preconditioning.c           | 15 +++++++++++++--
 interface/ceed.c                           |  7 ++++---
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 5342b6a4a8..1e55763462 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -286,7 +286,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   if (!is_run_good) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
   }
@@ -327,7 +327,7 @@ static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector in
     if (!is_run_good[i]) {
       CeedOperator op_fallback;
 
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+      CeedDebug(ceed, "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n");
       CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
     }
@@ -513,7 +513,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, boo
   if (data->use_assembly_fallback) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
     return CEED_ERROR_SUCCESS;
@@ -684,7 +684,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator o
   if (data->use_assembly_fallback) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
     return CEED_ERROR_SUCCESS;
@@ -849,7 +849,7 @@ static int CeedSingleOperatorAssembleAtPoints_Cuda_gen(CeedOperator op, CeedInt
   if (data->use_assembly_fallback) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled));
     return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 9a04027f24..08bf7455d2 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -245,7 +245,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   if (!is_run_good) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
   }
@@ -293,7 +293,7 @@ static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector inp
     if (!is_run_good[i]) {
       CeedOperator op_fallback;
 
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+      CeedDebug(ceed, "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
       CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
     }
@@ -490,7 +490,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool
   if (data->use_assembly_fallback) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LineearAssembleQFunction\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
     return CEED_ERROR_SUCCESS;
@@ -674,7 +674,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op
   if (data->use_assembly_fallback) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
     return CEED_ERROR_SUCCESS;
@@ -859,7 +859,7 @@ static int CeedSingleOperatorAssembleAtPoints_Hip_gen(CeedOperator op, CeedInt o
   if (data->use_assembly_fallback) {
     CeedOperator op_fallback;
 
-    CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled));
     return CEED_ERROR_SUCCESS;
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index ecc224b7e2..0cab467cd5 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -585,6 +585,7 @@ static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op
   CeedCall(CeedOperatorGetFallbackParent(op, &op_fallback_parent));
   if (op_fallback_parent && use_parent && op_fallback_parent->LinearAssembleQFunctionUpdate) {
     // -- Backend version for op fallback parent is faster, if it exists
+    CeedDebug(CeedOperatorReturnCeed(op), "Using fallback parent for CeedOperatorLinearAssembleQFunctionBuildOrUpdate\n");
     LinearAssembleQFunctionUpdate = op_fallback_parent->LinearAssembleQFunctionUpdate;
     op_assemble                   = op_fallback_parent;
   } else if (op->LinearAssembleQFunctionUpdate) {
@@ -624,6 +625,7 @@ static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleQFunctionBuildOrUpdate\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
     else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate");
@@ -690,6 +692,7 @@ int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector value
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedSingleOperatorAssemble\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedSingleOperatorAssemble(op_fallback, offset, values));
@@ -1948,8 +1951,8 @@ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) {
       CeedCall(CeedOperatorGetName(op, &op_name));
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n");
-      CeedDebug(ceed, "CeedOperator \"%s\": falling back from operator at address %p with backend %s to operator at address %p with backend %s\n",
-                op_name, op, resource, op->op_fallback, resource_fallback);
+      CeedDebug(ceed, "Falling back from Operator with backend %s at address %p to Operator with backend %s at address %p for CeedOperator \"%s\"\n",
+                resource, op, resource_fallback, op->op_fallback, op_name);
       CeedCall(CeedDestroy(&ceed_fallback));
     }
     CeedCall(CeedDestroy(&ceed));
@@ -2028,6 +2031,7 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled,
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleQFunction\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunction(op_fallback, assembled, rstr, request));
     else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction");
@@ -2111,6 +2115,7 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleDiagonal(op_fallback, assembled, request));
@@ -2170,6 +2175,7 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleAddDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
@@ -2335,6 +2341,7 @@ int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector ass
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssemblePointBlockDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssemblePointBlockDiagonal(op_fallback, assembled, request));
@@ -2392,6 +2399,7 @@ int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleAddPointBlockDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(op_fallback, assembled, request));
@@ -2444,6 +2452,7 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleSymbolic\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleSymbolic(op_fallback, num_entries, rows, cols));
@@ -2542,6 +2551,7 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssemble\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssemble(op_fallback, values));
@@ -2839,6 +2849,7 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorCreateFDMElementInverse\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorCreateFDMElementInverse(op_fallback, fdm_inv, request));
diff --git a/interface/ceed.c b/interface/ceed.c
index 96f0093984..39a8d3a911 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -650,11 +650,11 @@ int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) {
 **/
 int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
   if (ceed->has_valid_op_fallback_resource) {
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n");
-    CeedDebug(ceed, "Getting fallback from %s to %s\n", ceed->resource, ceed->op_fallback_resource);
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed Fallback ----------\n");
+    CeedDebug(ceed, "Falling back from Ceed with backend %s at address %p to Ceed with backend %s", ceed->resource, ceed, ceed->op_fallback_resource);
   }
 
-  // Create fallback Ceed if uninitalized
+  // Create fallback Ceed if uninitialized
   if (!ceed->op_fallback_ceed && ceed->has_valid_op_fallback_resource) {
     CeedDebug(ceed, "Creating fallback Ceed");
 
@@ -688,6 +688,7 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
     }
   }
   *fallback_ceed = NULL;
+  CeedDebug(ceed, "Fallback Ceed with backend %s at address %p\n", ceed->op_fallback_resource, ceed->op_fallback_ceed);
   if (ceed->op_fallback_ceed) CeedCall(CeedReferenceCopy(ceed->op_fallback_ceed, fallback_ceed));
   return CEED_ERROR_SUCCESS;
 }

From c21e34e2069f004c216393392d2c47e0444dd836 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 9 Jul 2025 10:22:06 -0600
Subject: [PATCH 440/571] debug - color consistency

---
 backends/cuda/ceed-cuda-compile.cpp | 12 ++++++------
 backends/hip/ceed-hip-compile.cpp   | 12 ++++++------
 interface/ceed-preconditioning.c    |  2 --
 interface/ceed-qfunction-register.c |  2 +-
 interface/ceed-register.c           |  2 +-
 5 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 370e165fc5..dc430b81fa 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -119,17 +119,17 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JIT SOURCE ----------\n");
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
   if (CeedDebugFlag(ceed)) {
     // LCOV_EXCL_START
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- JiT COMPILER OPTIONS ----------\n");
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
     for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
       CeedDebug(ceed, "Option %d: %s", i, opts[i]);
     }
     CeedDebug(ceed, "");
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JiT COMPILER OPTIONS ----------\n");
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
     // LCOV_EXCL_STOP
   }
   nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
@@ -155,7 +155,7 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
       // LCOV_EXCL_START
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
       CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
       CeedCallBackend(CeedFree(&log));
       CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
       return CEED_ERROR_SUCCESS;
@@ -266,7 +266,7 @@ static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstrea
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
       CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
                 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
       // LCOV_EXCL_STOP
     }
     *is_good_run = false;
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index bc44c5e0f7..0eeec57c78 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -121,17 +121,17 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e
   CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JIT SOURCE ----------\n");
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
   if (CeedDebugFlag(ceed)) {
     // LCOV_EXCL_START
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- JiT COMPILER OPTIONS ----------\n");
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
     for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
       CeedDebug(ceed, "Option %d: %s", i, opts[i]);
     }
     CeedDebug(ceed, "");
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- END OF JiT COMPILER OPTIONS ----------\n");
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
     // LCOV_EXCL_STOP
   }
   hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
@@ -157,7 +157,7 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e
       // LCOV_EXCL_START
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
       CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", hiprtcGetErrorString(result), log);
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
       CeedCallBackend(CeedFree(&log));
       CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog));
       return CEED_ERROR_SUCCESS;
@@ -242,7 +242,7 @@ static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipSt
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
       CeedDebug(ceed, "%s\n", message);
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
       // LCOV_EXCL_STOP
     }
     *is_good_run = false;
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 0cab467cd5..cffa77d2ae 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -42,7 +42,6 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, Cee
   // Check if NULL qf passed in
   if (!qf) return CEED_ERROR_SUCCESS;
 
-  CeedDebug256(CeedQFunctionReturnCeed(qf), 1, "---------- CeedOperator Fallback ----------\n");
   CeedDebug(CeedQFunctionReturnCeed(qf), "Creating fallback CeedQFunction\n");
 
   if (qf->source_path) {
@@ -117,7 +116,6 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
   CeedCall(CeedDestroy(&ceed));
   if (!ceed_fallback) return CEED_ERROR_SUCCESS;
 
-  CeedDebug256(CeedOperatorReturnCeed(op), 1, "---------- CeedOperator Fallback ----------\n");
   CeedDebug(CeedOperatorReturnCeed(op), "Creating fallback CeedOperator\n");
 
   // Clone Op
diff --git a/interface/ceed-qfunction-register.c b/interface/ceed-qfunction-register.c
index bdce3f8815..05163b6a3d 100644
--- a/interface/ceed-qfunction-register.c
+++ b/interface/ceed-qfunction-register.c
@@ -32,7 +32,7 @@ int CeedQFunctionRegisterAll(void) {
 
   CeedPragmaCritical(CeedQFunctionRegisterAll) {
     if (!register_all_called) {
-      CeedDebugEnv256(1, "\n---------- Registering Gallery QFunctions ----------\n");
+      CeedDebugEnv256(CEED_DEBUG_COLOR_SUCCESS, "\n---------- Registering Gallery QFunctions ----------\n");
 #define CEED_GALLERY_QFUNCTION(name) \
   if (!ierr) ierr = name();
 #include "../gallery/ceed-gallery-list.h"
diff --git a/interface/ceed-register.c b/interface/ceed-register.c
index 2d3413eaf2..08d2189cc2 100644
--- a/interface/ceed-register.c
+++ b/interface/ceed-register.c
@@ -32,7 +32,7 @@ int CeedRegisterAll(void) {
 
   CeedPragmaCritical(CeedRegisterAll) {
     if (!register_all_called) {
-      CeedDebugEnv256(1, "\n---------- Registering Backends ----------\n");
+      CeedDebugEnv256(CEED_DEBUG_COLOR_SUCCESS, "\n---------- Registering Backends ----------\n");
 #define CEED_BACKEND(name, ...) \
   if (!ierr) ierr = name();
 #include "../backends/ceed-backend-list.h"

From 5278038698e0397fd3ea85fa7eb4dab7e3d3aba2 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 9 Jul 2025 14:46:31 -0600
Subject: [PATCH 441/571] basis - fix flop counting for gpu at-points

---
 interface/ceed-basis.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index d17b8466f9..0be8f6ac08 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -903,6 +903,15 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
       post *= Q_1d;
     }
     if (is_at_points) {
+      bool is_gpu = false;
+
+      {
+        CeedMemType mem_type;
+
+        CeedCall(CeedGetPreferredMemType(CeedBasisReturnCeed(basis), &mem_type));
+        is_gpu = mem_type == CEED_MEM_DEVICE;
+      }
+
       CeedInt chebyshev_flops = (Q_1d - 2) * 3 + 1, d_chebyshev_flops = (Q_1d - 2) * 8 + 1;
       CeedInt point_tensor_flops = 0, pre = CeedIntPow(Q_1d, dim - 1), post = 1;
 
@@ -917,16 +926,17 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
           *flops = 0;
           break;
         case CEED_EVAL_INTERP:
-          *flops = tensor_flops + num_points * (dim * chebyshev_flops + point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
+          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)) +
+                   num_points * (is_gpu ? num_comp : 1) * dim * chebyshev_flops;
           break;
         case CEED_EVAL_GRAD:
-          *flops = tensor_flops + num_points * (dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops + point_tensor_flops +
-                                                       (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)));
+          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)) +
+                   num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
           break;
         case CEED_EVAL_DIV:
         case CEED_EVAL_CURL: {
           // LCOV_EXCL_START
-          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
+          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported at points",
                            CeedEvalModes[eval_mode]);
           break;
           // LCOV_EXCL_STOP

From a82cd097ff20f09688a65f4c4c86d934c8731d68 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Wed, 9 Jul 2025 15:19:57 -0700
Subject: [PATCH 442/571] Update flop counts to match GPU templates for 3D

---
 interface/ceed-basis.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 0be8f6ac08..2cf7fb526d 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -925,14 +925,25 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
         case CEED_EVAL_NONE:
           *flops = 0;
           break;
-        case CEED_EVAL_INTERP:
-          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)) +
-                   num_points * (is_gpu ? num_comp : 1) * dim * chebyshev_flops;
+        case CEED_EVAL_INTERP: {
+          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
+          if (dim == 3 && is_gpu) {
+            *flops += num_points * num_comp * Q_1d * (dim * chebyshev_flops + 2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 * Q_1d + 1 : 3 * Q_1d));
+          } else {
+            *flops += num_points * (is_gpu ? num_comp : 1) * dim * chebyshev_flops;
+          }
           break;
-        case CEED_EVAL_GRAD:
-          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)) +
-                   num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
+        }
+        case CEED_EVAL_GRAD: {
+          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
+          if (dim == 3 && is_gpu) {
+            CeedInt inner_flops = (dim - 1) * chebyshev_flops + d_chebyshev_flops + 2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d;
+            *flops += num_points * num_comp * Q_1d * (dim * inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
+          } else {
+            *flops += num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
+          }
           break;
+        }
         case CEED_EVAL_DIV:
         case CEED_EVAL_CURL: {
           // LCOV_EXCL_START

From 802d760a181830887ae39e389e664dcc61030cbc Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Jul 2025 08:30:21 -0600
Subject: [PATCH 443/571] gpu - minor reordering to reduce at-points flops

---
 ...-shared-basis-tensor-at-points-templates.h | 74 +++++++++++--------
 ...-shared-basis-tensor-at-points-templates.h | 74 +++++++++++--------
 interface/ceed-basis.c                        |  9 ++-
 3 files changed, 94 insertions(+), 63 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index ae22a5f1ba..a9cd1209ef 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -291,11 +291,15 @@ template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
@@ -309,9 +313,6 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p,
         }
       }
       // Contract y and z direction
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      const CeedScalar z = chebyshev_x[k];
-
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
@@ -326,18 +327,19 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p,
 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
       // Contract y and z direction
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      const CeedScalar z = chebyshev_x[k];
-
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
@@ -370,16 +372,26 @@ template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Get z contraction value
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      CeedScalar z = chebyshev_x[k];
 
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
+      // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
+        // Update z value for final pass
+        if (dim == 2) {
+          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+          z = chebyshev_x[k];
+        }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -390,10 +402,6 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
           }
         }
         // Contract y and z direction
-        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        const CeedScalar z = chebyshev_x[k];
-
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
         for (CeedInt i = 0; i < Q_1D; i++) {
@@ -410,20 +418,26 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Get z contraction value
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      CeedScalar z = chebyshev_x[k];
 
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
+      // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
+        // Update z value for final pass
+        if (dim == 2) {
+          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+          z = chebyshev_x[k];
+        }
         // Contract y and z direction
-        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        const CeedScalar z = chebyshev_x[k];
-
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
         for (CeedInt i = 0; i < Q_1D; i++) {
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 5184f03443..1c8661ce3c 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -292,11 +292,15 @@ template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
@@ -310,9 +314,6 @@ inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, c
         }
       }
       // Contract y and z direction
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      const CeedScalar z = chebyshev_x[k];
-
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
@@ -327,18 +328,19 @@ inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, c
 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
       // Contract y and z direction
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      const CeedScalar z = chebyshev_x[k];
-
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
@@ -371,16 +373,26 @@ template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Get z contraction value
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      CeedScalar z = chebyshev_x[k];
 
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
+      // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
+        // Update z value for final pass
+        if (dim == 2) {
+          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+          z = chebyshev_x[k];
+        }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -391,10 +403,6 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
           }
         }
         // Contract y and z direction
-        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        const CeedScalar z = chebyshev_x[k];
-
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
         for (CeedInt i = 0; i < Q_1D; i++) {
@@ -411,20 +419,26 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
-  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    for (CeedInt k = 0; k < Q_1D; k++) {
-      CeedScalar buffer[Q_1D];
-      CeedScalar chebyshev_x[Q_1D];
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Get z contraction value
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      CeedScalar z = chebyshev_x[k];
 
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
+      // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
+        // Update z value for final pass
+        if (dim == 2) {
+          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+          z = chebyshev_x[k];
+        }
         // Contract y and z direction
-        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-        const CeedScalar z = chebyshev_x[k];
-
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
         for (CeedInt i = 0; i < Q_1D; i++) {
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 2cf7fb526d..9a99ae6611 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -928,7 +928,8 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
         case CEED_EVAL_INTERP: {
           *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
           if (dim == 3 && is_gpu) {
-            *flops += num_points * num_comp * Q_1d * (dim * chebyshev_flops + 2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 * Q_1d + 1 : 3 * Q_1d));
+            *flops += num_points * Q_1d *
+                      (chebyshev_flops + num_comp * (2 * chebyshev_flops + 2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 * Q_1d + 1 : 3 * Q_1d)));
           } else {
             *flops += num_points * (is_gpu ? num_comp : 1) * dim * chebyshev_flops;
           }
@@ -937,8 +938,10 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
         case CEED_EVAL_GRAD: {
           *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
           if (dim == 3 && is_gpu) {
-            CeedInt inner_flops = (dim - 1) * chebyshev_flops + d_chebyshev_flops + 2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d;
-            *flops += num_points * num_comp * Q_1d * (dim * inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
+            CeedInt inner_flops =
+                dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d + d_chebyshev_flops) + (2 * dim - 1) * chebyshev_flops;
+
+            *flops += num_points * Q_1d * num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
           } else {
             *flops += num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
           }

From 77c494ee9d34a4bf4ff3b2cfcca6ea48e8c1ea97 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 10 Jul 2025 08:30:21 -0600
Subject: [PATCH 444/571] gpu - minor reordering to reduce at-points flops

---
 backends/hip-gen/ceed-hip-gen-operator-build.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 674f94b428..16d8f59f3b 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1589,7 +1589,6 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
-  if (is_at_points) block_sizes[2] = 1;
   {
     bool is_compile_good = false;
 
@@ -2625,7 +2624,6 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
-  block_sizes[2] = 1;
   {
     bool is_compile_good = false;
 

From dc7b9553a2ac0166d392b9608b84ba98fa3f0a4a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 11 Jul 2025 11:09:58 -0600
Subject: [PATCH 445/571] gpu - minor reduction in AtPoints grad FLOPs

---
 ...-shared-basis-tensor-at-points-templates.h | 52 +++++++++----------
 ...-shared-basis-tensor-at-points-templates.h | 40 +++++++-------
 interface/ceed-basis.c                        |  4 +-
 3 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index a9cd1209ef..f0832cd073 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
     // Contract x direction
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
-        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
+        atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
       }
     }
     // Pull from shared to register
@@ -121,7 +121,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
     // Contract x direction
     if (p < NUM_POINTS) {
       for (CeedInt i = 0; i < Q_1D; i++) {
-        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
+        atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
       }
     }
     // Pull from shared to register
@@ -193,7 +193,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
         for (CeedInt j = 0; j < Q_1D; j++) {
           const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
     }
@@ -269,7 +269,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
           for (CeedInt j = 0; j < Q_1D; j++) {
             const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
         }
       }
@@ -354,7 +354,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
           for (CeedInt j = 0; j < Q_1D; j++) {
             const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
         }
       }
@@ -376,22 +376,20 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -404,8 +402,10 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
         }
       }
     }
@@ -422,26 +422,26 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
         }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -454,7 +454,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
             for (CeedInt j = 0; j < Q_1D; j++) {
               const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+              atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
             }
           }
         }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 1c8661ce3c..42ea65d1b0 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -377,22 +377,20 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -405,8 +403,10 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
         }
       }
     }
@@ -423,26 +423,26 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
         }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 9a99ae6611..e074532c78 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -939,9 +939,9 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
           *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
           if (dim == 3 && is_gpu) {
             CeedInt inner_flops =
-                dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d + d_chebyshev_flops) + (2 * dim - 1) * chebyshev_flops;
+                dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d) + (dim - 1) * (2 * chebyshev_flops + d_chebyshev_flops);
 
-            *flops += num_points * Q_1d * num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
+            *flops += num_points * Q_1d * (chebyshev_flops + d_chebyshev_flops) * num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
           } else {
             *flops += num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
           }

From 4db22773792bd8ce6cd524c612e7d04d4f840914 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 11 Jul 2025 14:09:33 -0600
Subject: [PATCH 446/571] atpoints - drop old hack to avoid bug

---
 backends/cuda-gen/ceed-cuda-gen-operator.c       | 2 +-
 backends/hip-gen/ceed-hip-gen-operator-build.cpp | 3 ---
 backends/hip-gen/ceed-hip-gen-operator.c         | 1 -
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 1e55763462..7965b2a70f 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -209,7 +209,7 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
 
   if (is_tensor) {
-    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block,
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
   } else {
     CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 674f94b428..9e29d16cb4 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1589,7 +1589,6 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
-  if (is_at_points) block_sizes[2] = 1;
   {
     bool is_compile_good = false;
 
@@ -2053,7 +2052,6 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
-  block_sizes[2] = 1;
   {
     bool is_compile_good = false;
 
@@ -2625,7 +2623,6 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
   // Compile
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
-  block_sizes[2] = 1;
   {
     bool is_compile_good = false;
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index 08bf7455d2..bf673730db 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -153,7 +153,6 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
 
   if (is_tensor) {
     CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
-    if (is_at_points) block_sizes[2] = 1;
   } else {
     CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;
 

From 27a8a65097afae0831e3f350b020ff161fd52392 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Sat, 12 Jul 2025 16:31:33 -0700
Subject: [PATCH 447/571] Fix grad basis flop counts

---
 interface/ceed-basis.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index e074532c78..4b798ab91a 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -941,7 +941,7 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
             CeedInt inner_flops =
                 dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d) + (dim - 1) * (2 * chebyshev_flops + d_chebyshev_flops);
 
-            *flops += num_points * Q_1d * (chebyshev_flops + d_chebyshev_flops) * num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
+            *flops += num_points * Q_1d * (chebyshev_flops + d_chebyshev_flops + num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0)));
           } else {
             *flops += num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
           }

From 73b5a3bf55aeb5ad8a797943274ffe079610ce9e Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 15 Jul 2025 11:21:32 -0600
Subject: [PATCH 448/571] ceed - replace usage of `ceed->op_fallback_parent`
 with `ceed->parent`

---
 include/ceed-impl.h |  2 +-
 interface/ceed.c    | 30 +++---------------------------
 2 files changed, 4 insertions(+), 28 deletions(-)

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 95c920604d..31b281ba7e 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -96,7 +96,7 @@ struct Ceed_private {
   Ceed         parent;
   ObjDelegate *obj_delegates;
   int          obj_delegate_count;
-  Ceed         op_fallback_ceed, op_fallback_parent;
+  Ceed         op_fallback_ceed;
   const char  *op_fallback_resource;
   char       **jit_source_roots;
   CeedInt      num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers;
diff --git a/interface/ceed.c b/interface/ceed.c
index 39a8d3a911..9d38a5742a 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -663,29 +663,9 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
 
     CeedCall(CeedGetOperatorFallbackResource(ceed, &fallback_resource));
     CeedCall(CeedInit(fallback_resource, &fallback_ceed));
-    fallback_ceed->op_fallback_parent = ceed;
-    fallback_ceed->Error              = ceed->Error;
-    ceed->op_fallback_ceed            = fallback_ceed;
-    {
-      const char **jit_source_roots;
-      CeedInt      num_jit_source_roots = 0;
-
-      CeedCall(CeedGetJitSourceRoots(ceed, &num_jit_source_roots, &jit_source_roots));
-      for (CeedInt i = 0; i < num_jit_source_roots; i++) {
-        CeedCall(CeedAddJitSourceRoot(fallback_ceed, jit_source_roots[i]));
-      }
-      CeedCall(CeedRestoreJitSourceRoots(ceed, &jit_source_roots));
-    }
-    {
-      const char **jit_defines;
-      CeedInt      num_jit_defines = 0;
-
-      CeedCall(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
-      for (CeedInt i = 0; i < num_jit_defines; i++) {
-        CeedCall(CeedAddJitSourceRoot(fallback_ceed, jit_defines[i]));
-      }
-      CeedCall(CeedRestoreJitDefines(ceed, &jit_defines));
-    }
+    fallback_ceed->parent  = ceed;
+    fallback_ceed->Error   = ceed->Error;
+    ceed->op_fallback_ceed = fallback_ceed;
   }
   *fallback_ceed = NULL;
   CeedDebug(ceed, "Fallback Ceed with backend %s at address %p\n", ceed->op_fallback_resource, ceed->op_fallback_ceed);
@@ -1584,7 +1564,6 @@ int CeedDestroy(Ceed *ceed) {
 // LCOV_EXCL_START
 const char *CeedErrorFormat(Ceed ceed, const char *format, va_list *args) {
   if (ceed->parent) return CeedErrorFormat(ceed->parent, format, args);
-  if (ceed->op_fallback_parent) return CeedErrorFormat(ceed->op_fallback_parent, format, args);
   // Using pointer to va_list for better FFI, but clang-tidy can't verify va_list is initalized
   vsnprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, format, *args);  // NOLINT
   return ceed->err_msg;
@@ -1648,7 +1627,6 @@ int CeedErrorReturn(Ceed ceed, const char *filename, int line_no, const char *fu
 // LCOV_EXCL_START
 int CeedErrorStore(Ceed ceed, const char *filename, int line_no, const char *func, int err_code, const char *format, va_list *args) {
   if (ceed->parent) return CeedErrorStore(ceed->parent, filename, line_no, func, err_code, format, args);
-  if (ceed->op_fallback_parent) return CeedErrorStore(ceed->op_fallback_parent, filename, line_no, func, err_code, format, args);
 
   // Build message
   int len = snprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, "%s:%d in %s(): ", filename, line_no, func);
@@ -1728,7 +1706,6 @@ int CeedSetErrorHandler(Ceed ceed, CeedErrorHandler handler) {
 **/
 int CeedGetErrorMessage(Ceed ceed, const char **err_msg) {
   if (ceed->parent) return CeedGetErrorMessage(ceed->parent, err_msg);
-  if (ceed->op_fallback_parent) return CeedGetErrorMessage(ceed->op_fallback_parent, err_msg);
   *err_msg = ceed->err_msg;
   return CEED_ERROR_SUCCESS;
 }
@@ -1747,7 +1724,6 @@ int CeedGetErrorMessage(Ceed ceed, const char **err_msg) {
 **/
 int CeedResetErrorMessage(Ceed ceed, const char **err_msg) {
   if (ceed->parent) return CeedResetErrorMessage(ceed->parent, err_msg);
-  if (ceed->op_fallback_parent) return CeedResetErrorMessage(ceed->op_fallback_parent, err_msg);
   *err_msg = NULL;
   memcpy(ceed->err_msg, "No error message stored", 24);
   return CEED_ERROR_SUCCESS;

From 45e62b5fd85c07bf32886201b28fd86143ee0826 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 16 Jul 2025 06:43:51 -0600
Subject: [PATCH 449/571] ex - update example folder Readmes

---
 examples/README.md         | 24 ++++++++++++++++++++----
 examples/ceed/README.md    |  2 +-
 examples/deal.II/README.md |  2 ++
 examples/mfem/README.md    | 18 ++++++++++++++++++
 examples/python/README.md  | 25 +++++++++++++++++++++++++
 5 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 examples/mfem/README.md
 create mode 100644 examples/python/README.md

diff --git a/examples/README.md b/examples/README.md
index d5adadad36..006d6c0c71 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -13,7 +13,7 @@ For more details, please see the dedicated [documentation section](https://libce
 
 The Center for Efficient Exascale Discretizations (CEED) uses Bakeoff Problems (BPs) to test and compare the performance of high-order finite element implementations.
 The definitions of the problems are given on the ceed [website](https://ceed.exascaleproject.org/bps/).
-Each of the following bakeoff problems that use external discretization libraries (such as MFEM, PETSc, and Nek5000) are located in the subdirectories `mfem/`, `petsc/`, and `nek5000/`, respectively.
+Each of the following bakeoff problems that use external discretization libraries (such as deal.II, MFEM, PETSc, and Nek5000) are located in the subdirectories `deal.II/`, `mfem/`, `petsc/`, and `nek5000/`, respectively.
 
 Here we provide a short summary:
 
@@ -22,6 +22,13 @@ Here we provide a short summary:
 :widths: auto
 * - User code
   - Supported BPs
+* - `deal.II`
+  - * BP1 (scalar mass operator) with $Q=P+1$
+    * BP2 (vector mass operator) with $Q=P+1$
+    * BP3 (scalar Laplace operator) with $Q=P+1$
+    * BP4 (vector Laplace operator) with $Q=P+1$
+    * BP5 (collocated scalar Laplace operator) with $Q=P$
+    * BP6 (collocated vector Laplace operator) with $Q=P$
 * - `mfem`
   - * BP1 (scalar mass operator) with $Q=P+1$
     * BP3 (scalar Laplace operator) with $Q=P+1$
@@ -50,12 +57,12 @@ This latter choice is popular in applications that use spectral element methods
 
 For a more detailed description of the operators employed in the BPs, please see the dedicated [BPs documentation section](https://libceed.org/en/latest/examples/bps.html).
 
-## PETSc+libCEED Navier-Stokes Solver
+## PETSc+libCEED Fluid Dynamics Navier-Stokes Mini-App
 
 The Navier-Stokes problem solves the compressible Navier-Stokes equations using an explicit or implicit time integration.
 A more detailed description of the problem formulation can be found in the [fluids/](./fluids) folder and the corresponding [fluids documentation page](https://libceed.org/en/latest/examples/fluids/index.html).
 
-## PETSc+libCEED Solid mechanics elasticity mini-app
+## PETSc+libCEED Solid Mechanics Elasticity Mini-App
 
 This example solves the steady-state static momentum balance equations using unstructured high-order finite/spectral element spatial discretizations.
 A more detailed description of the problem formulation can be found in the [solids/](./solids) folder and the corresponding [solids documentation page](https://libceed.org/en/latest/examples/solids/index.html).
@@ -70,9 +77,18 @@ For a detailed description, please see the corresponding [area documentation pag
 These examples, located in the [petsc/](./petsc) folder, reproduce the Bakeoff Problems 1-6 on a discrete cubed-sphere, using PETSc.
 For a detailed description, please see the corresponding [problems on the cubed-sphere documentation page](https://libceed.org/en/latest/examples/petsc/index.html#bakeoff-problems-on-the-cubed-sphere).
 
+## libCEED Python Examples
+
+These Jupyter notebooks explore the concepts of the libCEED API, including how to install the Python interface and the usage of each API object, with interactive examples.
+The basic libCEED C examples in `/ceed` folder are also available as Python examples.
+
+## libCEED Rust Examples
+
+The basic libCEED C examples in `/ceed` folder are also available as Rust examples.
+
 ## Running Examples
 
-To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run
+To build the examples, set the `DEAL_II_DIR`, `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run
 
 ```{include} ../README.md
 :start-after: <!-- running-examples-inclusion -->
diff --git a/examples/ceed/README.md b/examples/ceed/README.md
index 2495f01d0c..0d6c64188c 100644
--- a/examples/ceed/README.md
+++ b/examples/ceed/README.md
@@ -1,4 +1,4 @@
-## libCEED: Basic Examples
+## libCEED Basic Examples
 
 Two examples are provided that rely only upon libCEED without any external libraries.
 
diff --git a/examples/deal.II/README.md b/examples/deal.II/README.md
index f6065b2c53..1ae1f794d8 100644
--- a/examples/deal.II/README.md
+++ b/examples/deal.II/README.md
@@ -1,3 +1,5 @@
+## libCEED deal.II Example
+
 An example how to write libCEED operators (BP1-BP6) within the open-source finite element library [deal.II](https://www.dealii.org/).
 As reference, operators are presented that use the native matrix-free infrastructure.
 
diff --git a/examples/mfem/README.md b/examples/mfem/README.md
new file mode 100644
index 0000000000..d6d2002177
--- /dev/null
+++ b/examples/mfem/README.md
@@ -0,0 +1,18 @@
+## libCEED MFEM Examples
+
+These examples show to write libCEED operators (BP1 and BP3) within the open-source finite element library [MFEM](https://www.mfem.org/).
+
+First compile MFEM and libCEED individually. After that, compile the MFEM example:
+
+```bash
+export MFEM_DIR=/path/to/mfem
+make
+```
+
+To run the executable, write:
+
+```
+./bp[1, 3]
+```
+
+Optional command-line arguments are shown by adding the command-line argument "--help".
diff --git a/examples/python/README.md b/examples/python/README.md
new file mode 100644
index 0000000000..ca0019ab94
--- /dev/null
+++ b/examples/python/README.md
@@ -0,0 +1,25 @@
+## libCEED Python Examples
+
+These examples are written using libCEED's Python interface.
+
+### Tutorials
+
+These Jupyter notebooks explore the concepts of the libCEED API, including how to install the Python interface and the usage of each API object, with interactive examples.
+
+### Basic Examples
+
+The basic libCEED C examples in the folder `/examples/ceed` are also available as Python examples.
+
+To build the QFunctions into a shared library that the Python examples use, run
+
+```bash
+make setup
+```
+
+To execute the examples, run:
+
+```
+python ex1_volume.py
+```
+
+A full list of command-line arguments are shown by adding the command-line argument "--help".

From 5da5ab9fd3cba19342bf8c55e0062d74fad46ef3 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 16 Jul 2025 09:45:38 -0600
Subject: [PATCH 450/571] op - Add CeedOperatorApplyAddActive for only summing
 into active outputs

---
 include/ceed/ceed.h       |  1 +
 interface/ceed-operator.c | 77 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index af510065eb..ce6284b910 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -469,6 +469,7 @@ CEED_EXTERN int  CeedOperatorGetContextBooleanRead(CeedOperator op, CeedContextF
 CEED_EXTERN int  CeedOperatorRestoreContextBooleanRead(CeedOperator op, CeedContextFieldLabel field_label, const bool **values);
 CEED_EXTERN int  CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
+CEED_EXTERN int  CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorAssemblyDataStrip(CeedOperator op);
 CEED_EXTERN int  CeedOperatorDestroy(CeedOperator *op);
 
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index d0545ba7e6..d23aa7fea5 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1076,10 +1076,10 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin
 
 /**
   @brief Get a boolean value indicating if the `CeedOperator` was created with `CeedOperatorCreateAtPoints`
-    
+
   @param[in]  op           `CeedOperator`
   @param[out] is_at_points Variable to store at points status
-  
+
   @return An error code: 0 - success, otherwise - failure
 
   @ref User
@@ -2142,7 +2142,7 @@ int CeedOperatorRestoreContextBooleanRead(CeedOperator op, CeedContextFieldLabel
   This computes the action of the operator on the specified (active) input, yielding its (active) output.
   All inputs and outputs must be specified using @ref CeedOperatorSetField().
 
-  Note: Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+  @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
 
   @param[in]  op      `CeedOperator` to apply
   @param[in]  in      `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs
@@ -2223,6 +2223,10 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques
   This computes the action of the operator on the specified (active) input, yielding its (active) output.
   All inputs and outputs must be specified using @ref CeedOperatorSetField().
 
+  @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+  @warning This function adds into ALL outputs, including passive outputs. To only add into the active output, use `CeedOperatorApplyAddActive()`.
+  @see `CeedOperatorApplyAddActive()`
+
   @param[in]  op      `CeedOperator` to apply
   @param[in]  in      `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs
   @param[out] out     `CeedVector` to sum in result of applying operator (must be distinct from `in`) or @ref CEED_VECTOR_NONE if there are no active outputs
@@ -2259,6 +2263,73 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Apply `CeedOperator` to a `CeedVector` and add result to output `CeedVector`. Only sums into active outputs, overwrites passive outputs.
+
+  This computes the action of the operator on the specified (active) input, yielding its (active) output.
+  All inputs and outputs must be specified using @ref CeedOperatorSetField().
+
+  @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+
+  @param[in]  op      `CeedOperator` to apply
+  @param[in]  in      `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs
+  @param[out] out     `CeedVector` to sum in result of applying operator (must be distinct from `in`) or @ref CEED_VECTOR_NONE if there are no active outputs
+  @param[in]  request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorCheckReady(op));
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    // Composite Operator
+    CeedInt       num_suboperators;
+    CeedOperator *sub_operators;
+
+    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+
+    // Zero all output vectors
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      CeedInt            num_output_fields;
+      CeedOperatorField *output_fields;
+
+      CeedCall(CeedOperatorGetFields(sub_operators[i], NULL, NULL, &num_output_fields, &output_fields));
+      for (CeedInt j = 0; j < num_output_fields; j++) {
+        CeedVector vec;
+
+        CeedCall(CeedOperatorFieldGetVector(output_fields[j], &vec));
+        if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0));
+        CeedCall(CeedVectorDestroy(&vec));
+      }
+    }
+    // ApplyAdd
+    CeedCall(CeedOperatorApplyAdd(op, in, out, request));
+  } else {
+    // Standard Operator
+    CeedInt            num_output_fields;
+    CeedOperatorField *output_fields;
+
+    CeedCall(CeedOperatorGetFields(op, NULL, NULL, &num_output_fields, &output_fields));
+    // Zero all output vectors
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedVector vec;
+
+      CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec));
+      if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0));
+      CeedCall(CeedVectorDestroy(&vec));
+    }
+    // ApplyAdd
+    CeedCall(CeedOperatorApplyAdd(op, in, out, request));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Destroy temporary assembly data associated with a `CeedOperator`
 

From 0db52efc76395f116dcbcc2ef5af6412f2cc714a Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 16 Jul 2025 10:09:28 -0600
Subject: [PATCH 451/571] Reduce code duplication

---
 interface/ceed-operator.c | 61 +++++++--------------------------------
 1 file changed, 10 insertions(+), 51 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index d23aa7fea5..039f8522db 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -2159,60 +2159,19 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques
   CeedCall(CeedOperatorCheckReady(op));
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  if (is_composite) {
+  if (is_composite && op->ApplyComposite) {
     // Composite Operator
-    if (op->ApplyComposite) {
-      CeedCall(op->ApplyComposite(op, in, out, request));
-    } else {
-      CeedInt       num_suboperators;
-      CeedOperator *sub_operators;
-
-      CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-
-      // Zero all output vectors
-      if (out != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(out, 0.0));
-      for (CeedInt i = 0; i < num_suboperators; i++) {
-        CeedInt            num_output_fields;
-        CeedOperatorField *output_fields;
-
-        CeedCall(CeedOperatorGetFields(sub_operators[i], NULL, NULL, &num_output_fields, &output_fields));
-        for (CeedInt j = 0; j < num_output_fields; j++) {
-          CeedVector vec;
-
-          CeedCall(CeedOperatorFieldGetVector(output_fields[j], &vec));
-          if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) {
-            CeedCall(CeedVectorSetValue(vec, 0.0));
-          }
-          CeedCall(CeedVectorDestroy(&vec));
-        }
-      }
-      // ApplyAdd
-      CeedCall(CeedOperatorApplyAdd(op, in, out, request));
-    }
-  } else {
+    CeedCall(op->ApplyComposite(op, in, out, request));
+  } else if (!is_composite && op->Apply) {
     // Standard Operator
-    if (op->Apply) {
-      CeedCall(op->Apply(op, in, out, request));
-    } else {
-      CeedInt            num_output_fields;
-      CeedOperatorField *output_fields;
-
-      CeedCall(CeedOperatorGetFields(op, NULL, NULL, &num_output_fields, &output_fields));
-      // Zero all output vectors
-      for (CeedInt i = 0; i < num_output_fields; i++) {
-        bool       is_active;
-        CeedVector vec;
+    CeedCall(op->Apply(op, in, out, request));
+  } else {
+    // Standard or composite, default to zeroing out and calling ApplyAddActive
+    // Zero active output
+    if (out != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(out, 0.0));
 
-        CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec));
-        is_active = vec == CEED_VECTOR_ACTIVE;
-        if (is_active) vec = out;
-        if (vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0));
-        if (!is_active) CeedCall(CeedVectorDestroy(&vec));
-      }
-      // Apply
-      if (op->num_elem > 0) CeedCall(op->ApplyAdd(op, in, out, request));
-    }
+    // ApplyAddActive
+    CeedCall(CeedOperatorApplyAddActive(op, in, out, request));
   }
   return CEED_ERROR_SUCCESS;
 }

From 46b50f9e3387fe901ca6f8767a26864fb30d4072 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 16 Jul 2025 10:28:06 -0600
Subject: [PATCH 452/571] op - Simplify operator fallback to use
 pre-initialized fallback Ceed context

---
 backends/cuda-gen/ceed-cuda-gen.c        |  6 ++-
 backends/hip-gen/ceed-hip-gen.c          |  6 ++-
 backends/sycl-gen/ceed-sycl-gen.sycl.cpp | 12 ++---
 include/ceed-impl.h                      |  2 -
 include/ceed/backend.h                   |  3 +-
 interface/ceed.c                         | 58 ++++--------------------
 6 files changed, 23 insertions(+), 64 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index f38b700225..e826f0aab1 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -18,7 +18,7 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
   char      *resource_root;
-  Ceed       ceed_shared;
+  Ceed       ceed_shared, ceed_ref;
   Ceed_Cuda *data;
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
@@ -34,7 +34,9 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
   CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, "/gpu/cuda/ref"));
+  CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref));
+  CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen));
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index 9871863507..9dd958b321 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -18,7 +18,7 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
   char     *resource_root;
-  Ceed      ceed_shared;
+  Ceed      ceed_shared, ceed_ref;
   Ceed_Hip *data;
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
@@ -34,7 +34,9 @@ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
   CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, "/gpu/hip/ref"));
+  CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref));
+  CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen));
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index 29abd0acb2..e07fe476d6 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -18,7 +18,7 @@
 // Backend init
 //------------------------------------------------------------------------------
 static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
-  Ceed       ceed_shared;
+  Ceed       ceed_shared, ceed_ref;
   Ceed_Sycl *data;
   char      *resource_root;
 
@@ -36,12 +36,10 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetStream_Sycl(ceed_shared, &(data->sycl_queue)));
   CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, "/gpu/sycl/ref"));
-
-  Ceed ceed_fallback = NULL;
-  CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
-  CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, &(data->sycl_queue)));
-  CeedCallBackend(CeedDestroy(&ceed_fallback));
+  CeedCallBackend(CeedInit("/gpu/sycl/ref", &ceed_ref));
+  CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref));
+  CeedCallBackend(CeedSetStream_Sycl(ceed_ref, &(data->sycl_queue)));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen));
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 31b281ba7e..1b6875a6ca 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -97,7 +97,6 @@ struct Ceed_private {
   ObjDelegate *obj_delegates;
   int          obj_delegate_count;
   Ceed         op_fallback_ceed;
-  const char  *op_fallback_resource;
   char       **jit_source_roots;
   CeedInt      num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers;
   char       **jit_defines;
@@ -126,7 +125,6 @@ struct Ceed_private {
   int             ref_count;
   void           *data;
   bool            is_debug;
-  bool            has_valid_op_fallback_resource;
   bool            is_deterministic;
   char            err_msg[CEED_MAX_RESOURCE_LEN];
   FOffset        *f_offsets;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index e6b608e571..b4b5a980c7 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -246,9 +246,8 @@ CEED_EXTERN int CeedGetDelegate(Ceed ceed, Ceed *delegate);
 CEED_EXTERN int CeedSetDelegate(Ceed ceed, Ceed delegate);
 CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name);
 CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name);
-CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource);
 CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed);
-CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource);
+CEED_EXTERN int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed);
 CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic);
 CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void));
 CEED_EXTERN int CeedGetData(Ceed ceed, void *data);
diff --git a/interface/ceed.c b/interface/ceed.c
index 9d38a5742a..b4bf09883a 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -623,21 +623,6 @@ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get the fallback resource for `CeedOperator`
-
-  @param[in]  ceed     `Ceed` context
-  @param[out] resource Variable to store fallback resource
-
-  @return An error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) {
-  *resource = (const char *)ceed->op_fallback_resource;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Get the fallback `Ceed` for `CeedOperator`
 
@@ -649,26 +634,13 @@ int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) {
   @ref Backend
 **/
 int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
-  if (ceed->has_valid_op_fallback_resource) {
+  if (ceed->op_fallback_ceed) {
     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed Fallback ----------\n");
-    CeedDebug(ceed, "Falling back from Ceed with backend %s at address %p to Ceed with backend %s", ceed->resource, ceed, ceed->op_fallback_resource);
+    CeedDebug(ceed, "Falling back from Ceed with backend %s at address %p to Ceed with backend %s at address %p", ceed->resource, ceed,
+              ceed->op_fallback_ceed->resource, ceed->op_fallback_ceed);
   }
 
-  // Create fallback Ceed if uninitialized
-  if (!ceed->op_fallback_ceed && ceed->has_valid_op_fallback_resource) {
-    CeedDebug(ceed, "Creating fallback Ceed");
-
-    Ceed        fallback_ceed;
-    const char *fallback_resource;
-
-    CeedCall(CeedGetOperatorFallbackResource(ceed, &fallback_resource));
-    CeedCall(CeedInit(fallback_resource, &fallback_ceed));
-    fallback_ceed->parent  = ceed;
-    fallback_ceed->Error   = ceed->Error;
-    ceed->op_fallback_ceed = fallback_ceed;
-  }
   *fallback_ceed = NULL;
-  CeedDebug(ceed, "Fallback Ceed with backend %s at address %p\n", ceed->op_fallback_resource, ceed->op_fallback_ceed);
   if (ceed->op_fallback_ceed) CeedCall(CeedReferenceCopy(ceed->op_fallback_ceed, fallback_ceed));
   return CEED_ERROR_SUCCESS;
 }
@@ -676,25 +648,18 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
 /**
   @brief Set the fallback resource for `CeedOperator`.
 
-  The current resource, if any, is freed by calling this function.
-  This string is freed upon the destruction of the `Ceed` context.
+  The current fallback, if any, is freed by calling this function.
 
-  @param[in,out] ceed     `Ceed` context
-  @param[in]     resource Fallback resource to set
+  @param[in,out] ceed          `Ceed` context
+  @param[in]     fallback_ceed `Ceed` context to create fallback operators
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Backend
 **/
-int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource) {
-  // Free old
-  CeedCall(CeedFree(&ceed->op_fallback_resource));
-
-  // Set new
-  CeedCall(CeedStringAllocCopy(resource, (char **)&ceed->op_fallback_resource));
-
-  // Check validity
-  ceed->has_valid_op_fallback_resource = ceed->op_fallback_resource && ceed->resource && strcmp(ceed->op_fallback_resource, ceed->resource);
+int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed) {
+  CeedCall(CeedReferenceCopy(fallback_ceed, &ceed->op_fallback_ceed));
+  fallback_ceed->parent = ceed;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1303,10 +1268,6 @@ int CeedInit(const char *resource, Ceed *ceed) {
   CeedCall(CeedCalloc(sizeof(f_offsets), &(*ceed)->f_offsets));
   memcpy((*ceed)->f_offsets, f_offsets, sizeof(f_offsets));
 
-  // Set fallback for advanced CeedOperator functions
-  const char fallback_resource[] = "";
-  CeedCall(CeedSetOperatorFallbackResource(*ceed, fallback_resource));
-
   // Record env variables CEED_DEBUG or DBG
   (*ceed)->is_debug = getenv("CEED_DEBUG") || getenv("DEBUG") || getenv("DBG");
 
@@ -1555,7 +1516,6 @@ int CeedDestroy(Ceed *ceed) {
   CeedCall(CeedFree(&(*ceed)->f_offsets));
   CeedCall(CeedFree(&(*ceed)->resource));
   CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));
-  CeedCall(CeedFree(&(*ceed)->op_fallback_resource));
   CeedCall(CeedWorkVectorsDestroy(*ceed));
   CeedCall(CeedFree(ceed));
   return CEED_ERROR_SUCCESS;

From 360be29c4c3e403ad2db02a312d57171db745993 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 17 Jul 2025 08:28:18 -0600
Subject: [PATCH 453/571] gpu - simplify atpoints if guards to prevent
 divergance

---
 ...-shared-basis-tensor-at-points-templates.h | 73 +++++++++----------
 ...-shared-basis-tensor-at-points-templates.h | 73 +++++++++----------
 2 files changed, 72 insertions(+), 74 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index f0832cd073..0a7bd751ef 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -180,21 +180,21 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
     __syncthreads();
     // Contract y direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
     for (CeedInt i = 0; i < Q_1D; i++) {
-      buffer[i] = chebyshev_x[i] * r_U[comp];
+      buffer[i] = chebyshev_x[i] * r_u;
     }
     // Contract x direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-    if (p < NUM_POINTS) {
-      for (CeedInt i = 0; i < Q_1D; i++) {
-        // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      // Note: shifting to avoid atomic adds
+      const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-        for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-          atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-        }
+        atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
       }
     }
     // Pull from shared to register
@@ -255,22 +255,22 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
       // Contract y direction
       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
+
       for (CeedInt i = 0; i < Q_1D; i++) {
-        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
+        buffer[i] = chebyshev_x[i] * r_u;
       }
       // Contract x direction
       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-      if (p < NUM_POINTS) {
-        for (CeedInt i = 0; i < Q_1D; i++) {
-          // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-          for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-            atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-          }
+          atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
     }
@@ -341,21 +341,21 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
       __syncthreads();
       // Contract y and z direction
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
       for (CeedInt i = 0; i < Q_1D; i++) {
-        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
+        buffer[i] = chebyshev_x[i] * r_u * z;
       }
       // Contract x direction
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-      if (p < NUM_POINTS) {
-        for (CeedInt i = 0; i < Q_1D; i++) {
-          // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-          for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-            atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-          }
+          atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
       // Pull from shared to register
@@ -438,24 +438,23 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
-        const CeedScalar zz = dim == 2 ? dz : z;
+        const CeedScalar zz  = dim == 2 ? dz : z;
+        const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
 
         for (CeedInt i = 0; i < Q_1D; i++) {
-          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
+          buffer[i] = chebyshev_x[i] * r_u * zz;
         }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-        if (p < NUM_POINTS) {
-          for (CeedInt i = 0; i < Q_1D; i++) {
-            // Note: shifting to avoid atomic adds
-            const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-            for (CeedInt j = 0; j < Q_1D; j++) {
-              const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-              atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-            }
+            atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
         }
       }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 42ea65d1b0..5da77545ff 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -181,21 +181,21 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
     __syncthreads();
     // Contract y direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
     for (CeedInt i = 0; i < Q_1D; i++) {
-      buffer[i] = chebyshev_x[i] * r_U[comp];
+      buffer[i] = chebyshev_x[i] * r_u;
     }
     // Contract x direction
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-    if (p < NUM_POINTS) {
-      for (CeedInt i = 0; i < Q_1D; i++) {
-        // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      // Note: shifting to avoid atomic adds
+      const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-        for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-        }
+        atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
       }
     }
     // Pull from shared to register
@@ -256,22 +256,22 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
       // Contract y direction
       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
+
       for (CeedInt i = 0; i < Q_1D; i++) {
-        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
+        buffer[i] = chebyshev_x[i] * r_u;
       }
       // Contract x direction
       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-      if (p < NUM_POINTS) {
-        for (CeedInt i = 0; i < Q_1D; i++) {
-          // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-          for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-          }
+          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
     }
@@ -342,21 +342,21 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
       __syncthreads();
       // Contract y and z direction
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
       for (CeedInt i = 0; i < Q_1D; i++) {
-        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
+        buffer[i] = chebyshev_x[i] * r_u * z;
       }
       // Contract x direction
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-      if (p < NUM_POINTS) {
-        for (CeedInt i = 0; i < Q_1D; i++) {
-          // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-          for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-          }
+          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
       // Pull from shared to register
@@ -439,24 +439,23 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
-        const CeedScalar zz = dim == 2 ? dz : z;
+        const CeedScalar zz  = dim == 2 ? dz : z;
+        const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
 
         for (CeedInt i = 0; i < Q_1D; i++) {
-          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
+          buffer[i] = chebyshev_x[i] * r_u * zz;
         }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
-        if (p < NUM_POINTS) {
-          for (CeedInt i = 0; i < Q_1D; i++) {
-            // Note: shifting to avoid atomic adds
-            const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + data.t_id_x) % Q_1D;
 
-            for (CeedInt j = 0; j < Q_1D; j++) {
-              const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + data.t_id_y) % Q_1D;
 
-              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
-            }
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
         }
       }

From c6cb50fa8deb031c155ee926195304aa24263da8 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 17 Jul 2025 08:31:03 -0600
Subject: [PATCH 454/571] gpu - reorder AtPoints shuffle to avoid bank
 conflicts

---
 ...uda-shared-basis-tensor-at-points-templates.h | 16 ++++++++--------
 ...hip-shared-basis-tensor-at-points-templates.h | 16 ++++++++--------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 0a7bd751ef..0d1622a591 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -189,10 +189,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
     for (CeedInt i = 0; i < Q_1D; i++) {
       // Note: shifting to avoid atomic adds
-      const CeedInt ii = (i + data.t_id_x) % Q_1D;
+      const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
       for (CeedInt j = 0; j < Q_1D; j++) {
-        const CeedInt jj = (j + data.t_id_y) % Q_1D;
+        const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
         atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
       }
@@ -265,10 +265,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
           atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -350,10 +350,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
           atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -449,10 +449,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+          const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+            const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
             atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 5da77545ff..09523d0fd2 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -190,10 +190,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
     for (CeedInt i = 0; i < Q_1D; i++) {
       // Note: shifting to avoid atomic adds
-      const CeedInt ii = (i + data.t_id_x) % Q_1D;
+      const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
       for (CeedInt j = 0; j < Q_1D; j++) {
-        const CeedInt jj = (j + data.t_id_y) % Q_1D;
+        const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
         atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
       }
@@ -266,10 +266,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -351,10 +351,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
       for (CeedInt i = 0; i < Q_1D; i++) {
         // Note: shifting to avoid atomic adds
-        const CeedInt ii = (i + data.t_id_x) % Q_1D;
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
         for (CeedInt j = 0; j < Q_1D; j++) {
-          const CeedInt jj = (j + data.t_id_y) % Q_1D;
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
@@ -450,10 +450,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
         for (CeedInt i = 0; i < Q_1D; i++) {
           // Note: shifting to avoid atomic adds
-          const CeedInt ii = (i + data.t_id_x) % Q_1D;
+          const CeedInt ii = (i + data.t_id_y) % Q_1D;
 
           for (CeedInt j = 0; j < Q_1D; j++) {
-            const CeedInt jj = (j + data.t_id_y) % Q_1D;
+            const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }

From 8a3c90c815d7c47d4c85d45b709b827586c3b9a0 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 17 Jul 2025 17:03:14 -0700
Subject: [PATCH 455/571] Update libCEEDdev.md with fallback changes

---
 doc/sphinx/source/libCEEDdev.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md
index 536588fe06..61d0156730 100644
--- a/doc/sphinx/source/libCEEDdev.md
+++ b/doc/sphinx/source/libCEEDdev.md
@@ -32,12 +32,12 @@ There are three mechanisms by which a Ceed backend can inherit implementations f
    This delegate {ref}`Ceed` will only provide the implementation of that specific libCeed object for the parent backend.
    Object delegation has higher precedence than delegation.
 
-3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackResource` to set a string identifying which {ref}`Ceed` backend will be instantiated to provide any unimplemented {ref}`CeedOperator` methods that support preconditioning, such as {c:func}`CeedOperatorLinearAssemble`.
+3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackCeed` to set a {ref}`Ceed` object to provide any unimplemented {ref}`CeedOperator` methods that support preconditioning, such as {c:func}`CeedOperatorLinearAssemble`.
    The parent backend must implement the basic {ref}`CeedOperator` functionality.
-   This fallback {ref}`Ceed` object will only be created if a method is called that is not implemented by the parent backend.
+   Like the delegates above, this fallback {ref}`Ceed` object should be created and set in the backend `CeedInit` function.
    In order to use operator fallback, the parent backend and fallback backend must use compatible E-vector and Q-vector layouts.
    For example, `/gpu/cuda/gen` falls back to `/gpu/cuda/ref` for missing {ref}`CeedOperator` preconditioning support methods.
-   If an unimplemented method is called, then the parent `/gpu/cuda/gen` {ref}`Ceed` object creates a fallback `/gpu/cuda/ref` {ref}`Ceed` object and creates a clone of the {ref}`CeedOperator` with this fallback {ref}`Ceed` object.
+   If an unimplemented method is called, then the parent `/gpu/cuda/gen` {ref}`Ceed` object uses its fallback `/gpu/cuda/ref` {ref}`Ceed` object to create a clone of the {ref}`CeedOperator`.
    This clone {ref}`CeedOperator` is then used for the unimplemented preconditioning support methods.
 
 ## Backend Families

From fba0e8d2b7919c5bf59742a5391779e176058e33 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 17 Jul 2025 09:27:50 -0600
Subject: [PATCH 456/571] dealii - drop redundant copy

---
 examples/deal.II/bps.h | 121 +++++++++--------------------------------
 1 file changed, 27 insertions(+), 94 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 3ff1c8cd0d..61bd7a4872 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -249,15 +249,14 @@ class OperatorCeed : public OperatorBase<Number>
 
           for (const auto i : dof_mapping)
             indices.emplace_back(
-              partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)]) /
-              n_components);
+              partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)]));
         }
 
     CeedElemRestrictionCreate(ceed,
                               n_local_active_cells,
                               fe.n_dofs_per_cell() / n_components,
                               n_components,
-                              std::max<unsigned int>(this->extended_local_size() / n_components, 1),
+                              1,
                               this->extended_local_size(),
                               CEED_MEM_HOST,
                               CEED_COPY_VALUES,
@@ -344,47 +343,20 @@ class OperatorCeed : public OperatorBase<Number>
     // communicate: update ghost values
     src.update_ghost_values();
 
-    if (dof_handler.get_fe().n_components() == 1)
-      {
-        // pass memory buffers to libCEED
-        VectorTypeCeed x(src_ceed);
-        VectorTypeCeed y(dst_ceed);
-        x.import_array(src, CEED_MEM_HOST);
-        y.import_array(dst, CEED_MEM_HOST);
-
-        // apply operator
-        CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
-
-        // pull arrays back to deal.II
-        x.sync_array();
-        y.sync_array();
-      }
-    else // TODO: needed for multiple components
-      {
-        // allocate space for block vectors
-        src_tmp.reinit(this->extended_local_size(), true);
-        dst_tmp.reinit(this->extended_local_size(), true);
-
-        // copy to block vector
-        copy_to_block_vector(src_tmp, src);
-
-        // pass memory buffers to libCEED
-        VectorTypeCeed x(src_ceed);
-        VectorTypeCeed y(dst_ceed);
-        x.import_array(src_tmp, CEED_MEM_HOST);
-        y.import_array(dst_tmp, CEED_MEM_HOST);
-
-        // apply operator
-        CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
-
-        // pull arrays back to deal.II
-        x.sync_array();
-        y.sync_array();
-
-        // copy from block vector
-        copy_from_block_vector(dst, dst_tmp);
-      }
-
+    {
+      // pass memory buffers to libCEED
+      VectorTypeCeed x(src_ceed);
+      VectorTypeCeed y(dst_ceed);
+      x.import_array(src, CEED_MEM_HOST);
+      y.import_array(dst, CEED_MEM_HOST);
+
+      // apply operator
+      CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
+
+      // pull arrays back to deal.II
+      x.sync_array();
+      y.sync_array();
+    }
     // communicate: compress
     src.zero_out_ghost_values();
     dst.compress(VectorOperation::add);
@@ -410,25 +382,16 @@ class OperatorCeed : public OperatorBase<Number>
   {
     this->initialize_dof_vector(diagonal);
 
-    // pass memory buffer to libCEED
-    VectorTypeCeed y(dst_ceed);
-    y.import_array(diagonal, CEED_MEM_HOST);
-
-    CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
-
-    // pull array back to deal.II
-    y.sync_array();
-
-    const unsigned int n_components = dof_handler.get_fe().n_components();
-
-    if (n_components > 1) // TODO: needed for multiple components
-      {
-        VectorType tmp(diagonal);
+    {
+      // pass memory buffer to libCEED
+      VectorTypeCeed y(dst_ceed);
+      y.import_array(diagonal, CEED_MEM_HOST);
 
-        copy_from_block_vector(tmp, diagonal);
+      CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
 
-        std::swap(tmp, diagonal);
-      }
+      // pull array back to deal.II
+      y.sync_array();
+    }
 
     diagonal.compress(VectorOperation::add);
 
@@ -503,36 +466,6 @@ class OperatorCeed : public OperatorBase<Number>
     CeedVector  vec_ceed;
   };
 
-  /**
-   * Copy from block vector.
-   *
-   * @note Only needed for multiple components.
-   */
-  void
-  copy_from_block_vector(VectorType &dst, const VectorType &src) const
-  {
-    const unsigned int scalar_size = this->extended_local_size() / dim;
-
-    for (unsigned int i = 0; i < scalar_size; ++i)
-      for (unsigned int j = 0; j < dim; ++j)
-        dst.get_values()[j + i * dim] = src.get_values()[j * scalar_size + i];
-  }
-
-  /**
-   * Copy to block vector.
-   *
-   * @note Only needed for multiple components.
-   */
-  void
-  copy_to_block_vector(VectorType &dst, const VectorType &src) const
-  {
-    const unsigned int scalar_size = this->extended_local_size() / dim;
-
-    for (unsigned int i = 0; i < scalar_size; ++i)
-      for (unsigned int j = 0; j < dim; ++j)
-        dst.get_values()[j * scalar_size + i] = src.get_values()[j + i * dim];
-  }
-
   /**
    * Number of locally active DoFs.
    */
@@ -656,12 +589,12 @@ class OperatorCeed : public OperatorBase<Number>
           for (const auto i : dof_mapping)
             {
               const auto index = geo_partitioner->global_to_local(local_indices[i]);
-              geo_indices.emplace_back(index);
+              geo_indices.emplace_back(index * dim);
 
               const auto point = fe_values.quadrature_point(i);
 
               for (unsigned int d = 0; d < dim; ++d)
-                geo_support_points[index + d * n_points] = point[d];
+                geo_support_points[index * dim + d] = point[d];
             }
         }
 
@@ -688,7 +621,7 @@ class OperatorCeed : public OperatorBase<Number>
                               n_local_active_cells,
                               geo_fe.n_dofs_per_cell(),
                               dim,
-                              std::max<unsigned int>(geo_support_points.size() / dim, 1),
+                              1,
                               geo_support_points.size(),
                               CEED_MEM_HOST,
                               CEED_COPY_VALUES,

From 704703efc9822a8339f73fbc4da0d2a9396e454a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Jul 2025 06:36:07 -0600
Subject: [PATCH 457/571] dealii - transpose basis matrices Co-authored-by:
 Peter Munch <muench@math.tu-berlin.de>

---
 examples/deal.II/bps.h | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 61bd7a4872..1665a01605 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -210,13 +210,23 @@ class OperatorCeed : public OperatorBase<Number>
       for (const auto q : shape_data.quadrature.get_points())
         q_ref_1d.push_back(q(0));
 
+      // transpose bases for compatibility with restriction
+      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
+      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
+      for (unsigned int i = 0; i < n_q_points; ++i)
+        for (unsigned int j = 0; j < fe_degree + 1; ++j)
+          {
+            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
+            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
+          }
+
       CeedBasisCreateTensorH1(ceed,
                               dim,
                               n_components,
                               fe_degree + 1,
                               n_q_points,
-                              shape_data.shape_values.data(),
-                              shape_data.shape_gradients.data(),
+                              interp_1d.data(),
+                              grad_1d.data(),
                               q_ref_1d.data(),
                               quadrature.get_tensor_basis()[0].get_weights().data(),
                               &sol_basis);
@@ -534,13 +544,23 @@ class OperatorCeed : public OperatorBase<Number>
       for (const auto q : shape_data.quadrature.get_points())
         q_ref_1d.push_back(q(0));
 
+      // transpose bases for compatibility with restriction
+      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
+      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
+      for (unsigned int i = 0; i < n_q_points; ++i)
+        for (unsigned int j = 0; j < fe_degree + 1; ++j)
+          {
+            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
+            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
+          }
+
       CeedBasisCreateTensorH1(ceed,
                               dim,
                               dim,
                               fe_degree + 1,
                               n_q_points,
-                              shape_data.shape_values.data(),
-                              shape_data.shape_gradients.data(),
+                              interp_1d.data(),
+                              grad_1d.data(),
                               q_ref_1d.data(),
                               quadrature.get_tensor_basis()[0].get_weights().data(),
                               &geo_basis);

From 7d3b97770519497b43cb9643139085f19391e5c3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Jul 2025 06:41:11 -0600
Subject: [PATCH 458/571] dealii - add take_array for clearer syncing intent

---
 examples/deal.II/bps.h | 51 ++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 1665a01605..2068fcce95 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -353,20 +353,19 @@ class OperatorCeed : public OperatorBase<Number>
     // communicate: update ghost values
     src.update_ghost_values();
 
-    {
-      // pass memory buffers to libCEED
-      VectorTypeCeed x(src_ceed);
-      VectorTypeCeed y(dst_ceed);
-      x.import_array(src, CEED_MEM_HOST);
-      y.import_array(dst, CEED_MEM_HOST);
-
-      // apply operator
-      CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
-
-      // pull arrays back to deal.II
-      x.sync_array();
-      y.sync_array();
-    }
+    // pass memory buffers to libCEED
+    VectorTypeCeed x(src_ceed);
+    VectorTypeCeed y(dst_ceed);
+    x.import_array(src, CEED_MEM_HOST);
+    y.import_array(dst, CEED_MEM_HOST);
+
+    // apply operator
+    CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
+
+    // pull arrays back to deal.II
+    x.take_array();
+    y.take_array();
+
     // communicate: compress
     src.zero_out_ghost_values();
     dst.compress(VectorOperation::add);
@@ -392,16 +391,14 @@ class OperatorCeed : public OperatorBase<Number>
   {
     this->initialize_dof_vector(diagonal);
 
-    {
-      // pass memory buffer to libCEED
-      VectorTypeCeed y(dst_ceed);
-      y.import_array(diagonal, CEED_MEM_HOST);
+    // pass memory buffer to libCEED
+    VectorTypeCeed y(dst_ceed);
+    y.import_array(diagonal, CEED_MEM_HOST);
 
-      CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
+    CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
 
-      // pull array back to deal.II
-      y.sync_array();
-    }
+    // pull array back to deal.II
+    y.take_array();
 
     diagonal.compress(VectorOperation::add);
 
@@ -453,6 +450,16 @@ class OperatorCeed : public OperatorBase<Number>
       CeedVectorSyncArray(vec_ceed, mem_space);
     }
 
+    /**
+     * Take previously set deal.II array from libCEED vector
+     */
+    void
+    take_array()
+    {
+      CeedScalar *ptr;
+      CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
+    }
+
     /**
      * Destructor: destroy vector view.
      */

From 49337e268dd02dd9226ab0b6553a69aaad904883 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Jul 2025 07:27:40 -0600
Subject: [PATCH 459/571] gpu - fix gen AtPoints transpose

---
 .../cuda-shared-basis-tensor-at-points-templates.h    | 11 ++++++-----
 .../hip/hip-shared-basis-tensor-at-points-templates.h |  8 ++++----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 0d1622a591..0da9163716 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -194,7 +194,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
       for (CeedInt j = 0; j < Q_1D; j++) {
         const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-        atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
       }
     }
     // Pull from shared to register
@@ -270,7 +270,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
         for (CeedInt j = 0; j < Q_1D; j++) {
           const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-          atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
     }
@@ -355,7 +355,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
         for (CeedInt j = 0; j < Q_1D; j++) {
           const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-          atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
       // Pull from shared to register
@@ -439,11 +439,12 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
         const CeedScalar zz  = dim == 2 ? dz : z;
-        const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
+        const CeedScalar r_u = (p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0;
 
         for (CeedInt i = 0; i < Q_1D; i++) {
           buffer[i] = chebyshev_x[i] * r_u * zz;
         }
+
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -454,7 +455,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
           for (CeedInt j = 0; j < Q_1D; j++) {
             const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-            atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
         }
       }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 09523d0fd2..1b1e6675f1 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -195,7 +195,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
       for (CeedInt j = 0; j < Q_1D; j++) {
         const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-        atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
       }
     }
     // Pull from shared to register
@@ -271,7 +271,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
         for (CeedInt j = 0; j < Q_1D; j++) {
           const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
     }
@@ -356,7 +356,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
         for (CeedInt j = 0; j < Q_1D; j++) {
           const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
         }
       }
       // Pull from shared to register
@@ -455,7 +455,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
           for (CeedInt j = 0; j < Q_1D; j++) {
             const CeedInt jj = (j + data.t_id_x) % Q_1D;
 
-            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
           }
         }
       }

From fe470f9dd60f9a04ff67bd8fff7ba78e2dba18cf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Jul 2025 11:38:19 -0600
Subject: [PATCH 460/571] ci - bypass strange lcov error

---
 .gitlab-ci.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b9d06a271f..5389460d56 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -124,7 +124,7 @@ noether-cpu:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -238,7 +238,7 @@ noether-cuda:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -305,7 +305,7 @@ noether-cuda:
 #  after_script:
 #    - |
 #      if [ -f .SUCCESS ]; then
-#        lcov --directory . --capture --output-file coverage.info;
+#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -408,7 +408,7 @@ noether-float:
   after_script:
     - |
       if [ $(cat .job_status) == "SUCCESS" ]; then
-        lcov --directory . --capture --output-file coverage.info;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;

From f36e753100926c940e21f506e95c0b0794520501 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 18 Jul 2025 14:05:09 -0600
Subject: [PATCH 461/571] ci - update test suite for parity with Ratel

---
 Makefile              |   7 +-
 tests/junit.py        |   8 +-
 tests/junit_common.py | 349 ++++++++++++++++++++++++++++++++----------
 3 files changed, 282 insertions(+), 82 deletions(-)

diff --git a/Makefile b/Makefile
index 0f3c74fd5a..3d22b24d01 100644
--- a/Makefile
+++ b/Makefile
@@ -797,7 +797,7 @@ NPROC_POOL ?= 1
 export NPROC_POOL
 
 run-% : $(OBJDIR)/%
-	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) $(<:$(OBJDIR)/%=%)
+	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --search '$(subsearch)' $(<:$(OBJDIR)/%=%)
 
 # The test and prove targets can be controlled via pattern searches.  The
 # default is to run tests and those examples that have no external dependencies.
@@ -809,6 +809,7 @@ run-% : $(OBJDIR)/%
 search ?= t ex
 realsearch = $(search:%=%%)
 matched = $(foreach pattern,$(realsearch),$(filter $(OBJDIR)/$(pattern),$(tests) $(allexamples)))
+subsearch ?= .*
 JUNIT_BATCH ?= ''
 
 # Test core libCEED
@@ -823,7 +824,7 @@ ctc-% : $(ctests);@$(foreach tst,$(ctests),$(tst) /cpu/$*;)
 # https://testanything.org/tap-specification.html
 prove : $(matched)
 	$(info Testing backends: $(BACKENDS))
-	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
+	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py' $(matched:$(OBJDIR)/%=%) :: --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --search '$(subsearch)'
 # Run prove target in parallel
 prv : ;@$(MAKE) $(MFLAGS) V=$(V) prove
 
@@ -831,7 +832,7 @@ prove-all :
 	+$(MAKE) prove realsearch=%
 
 junit-% : $(OBJDIR)/%
-	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
+	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --search '$(subsearch)' --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
 
 junit : $(matched:$(OBJDIR)/%=junit-%)
 
diff --git a/tests/junit.py b/tests/junit.py
index a20360d91f..b0144454af 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -27,6 +27,10 @@ def create_argparser() -> argparse.ArgumentParser:
     parser.add_argument('-o', '--output', type=Optional[Path], default=None, help='Output file to write test')
     parser.add_argument('-b', '--junit-batch', type=str, default='', help='Name of JUnit batch for output file')
     parser.add_argument('-np', '--pool-size', type=int, default=1, help='Number of test cases to run in parallel')
+    parser.add_argument('-s', '--search', type=str, default='.*',
+                        help='Search string to filter tests, using `re` package format')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False,
+                        help='print details for all runs, not just failures')
     parser.add_argument('test', help='Test executable', nargs='?')
 
     return parser
@@ -201,7 +205,9 @@ def check_allowed_stdout(self, test: str) -> bool:
         args.mode,
         args.nproc,
         CeedSuiteSpec(),
-        args.pool_size)
+        args.pool_size,
+        search=args.search,
+        verbose=args.verbose)
 
     # write output and check for failures
     if args.mode is RunMode.JUNIT:
diff --git a/tests/junit_common.py b/tests/junit_common.py
index 94a888440f..42461b132c 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 import argparse
 import csv
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
 import difflib
 from enum import Enum
 from math import isclose
@@ -10,52 +11,75 @@
 import re
 import subprocess
 import multiprocessing as mp
-from itertools import product
 import sys
 import time
-from typing import Optional, Tuple, List, Callable
+from typing import Optional, Tuple, List, Dict, Callable, Iterable, get_origin
+import shutil
 
 sys.path.insert(0, str(Path(__file__).parent / "junit-xml"))
 from junit_xml import TestCase, TestSuite, to_xml_report_string  # nopep8
 
 
+class ParseError(RuntimeError):
+    """A custom exception for failed parsing."""
+
+    def __init__(self, message):
+        super().__init__(message)
+
+
 class CaseInsensitiveEnumAction(argparse.Action):
     """Action to convert input values to lower case prior to converting to an Enum type"""
 
     def __init__(self, option_strings, dest, type, default, **kwargs):
-        if not (issubclass(type, Enum) and issubclass(type, str)):
-            raise ValueError(f"{type} must be a StrEnum or str and Enum")
+        if not issubclass(type, Enum):
+            raise ValueError(f"{type} must be an Enum")
         # store provided enum type
         self.enum_type = type
-        if isinstance(default, str):
+        if isinstance(default, self.enum_type):
+            pass
+        elif isinstance(default, str):
             default = self.enum_type(default.lower())
-        else:
+        elif isinstance(default, Iterable):
             default = [self.enum_type(v.lower()) for v in default]
+        else:
+            raise argparse.ArgumentTypeError("Invalid value type, must be str or iterable")
         # prevent automatic type conversion
         super().__init__(option_strings, dest, default=default, **kwargs)
 
     def __call__(self, parser, namespace, values, option_string=None):
-        if isinstance(values, str):
+        if isinstance(values, self.enum_type):
+            pass
+        elif isinstance(values, str):
             values = self.enum_type(values.lower())
-        else:
+        elif isinstance(values, Iterable):
             values = [self.enum_type(v.lower()) for v in values]
+        else:
+            raise argparse.ArgumentTypeError("Invalid value type, must be str or iterable")
         setattr(namespace, self.dest, values)
 
 
 @dataclass
 class TestSpec:
     """Dataclass storing information about a single test case"""
-    name: str
+    name: str = field(default_factory=str)
+    csv_rtol: float = -1
+    csv_ztol: float = -1
+    cgns_tol: float = -1
     only: List = field(default_factory=list)
     args: List = field(default_factory=list)
+    key_values: Dict = field(default_factory=dict)
 
 
-class RunMode(str, Enum):
+class RunMode(Enum):
     """Enumeration of run modes, either `RunMode.TAP` or `RunMode.JUNIT`"""
-    __str__ = str.__str__
-    __format__ = str.__format__
-    TAP: str = 'tap'
-    JUNIT: str = 'junit'
+    TAP = 'tap'
+    JUNIT = 'junit'
+
+    def __str__(self):
+        return self.value
+
+    def __repr__(self):
+        return self.value
 
 
 class SuiteSpec(ABC):
@@ -97,6 +121,11 @@ def get_output_path(self, test: str, output_file: str) -> Path:
         """
         raise NotImplementedError
 
+    @property
+    def test_failure_artifacts_path(self) -> Path:
+        """Path to test failure artifacts"""
+        return Path('build') / 'test_failure_artifacts'
+
     @property
     def cgns_tol(self):
         """Absolute tolerance for CGNS diff"""
@@ -107,15 +136,24 @@ def cgns_tol(self, val):
         self._cgns_tol = val
 
     @property
-    def diff_csv_kwargs(self):
+    def csv_ztol(self):
+        """Keyword arguments to be passed to diff_csv()"""
+        return getattr(self, '_csv_ztol', 3e-10)
+
+    @csv_ztol.setter
+    def csv_ztol(self, val):
+        self._csv_ztol = val
+
+    @property
+    def csv_rtol(self):
         """Keyword arguments to be passed to diff_csv()"""
-        return getattr(self, '_diff_csv_kwargs', {})
+        return getattr(self, '_csv_rtol', 1e-6)
 
-    @diff_csv_kwargs.setter
-    def diff_csv_kwargs(self, val):
-        self._diff_csv_kwargs = val
+    @csv_rtol.setter
+    def csv_rtol(self, val):
+        self._csv_rtol = val
 
-    def post_test_hook(self, test: str, spec: TestSpec) -> None:
+    def post_test_hook(self, test: str, spec: TestSpec, backend: str) -> None:
         """Function callback ran after each test case
 
         Args:
@@ -219,6 +257,39 @@ def startswith_any(base: str, prefixes: List[str]) -> bool:
     return any((base.startswith(prefix) for prefix in prefixes))
 
 
+def find_matching(line: str, open: str = '(', close: str = ')') -> Tuple[int, int]:
+    """Find the start and end positions of the first outer paired delimeters
+
+    Args:
+        line (str): Line to search
+        open (str, optional): Opening delimiter, must be different than `close`. Defaults to '('.
+        close (str, optional): Closing delimeter, must be different than `open`. Defaults to ')'.
+
+    Raises:
+        RuntimeError: If open or close is not a single character
+        RuntimeError: If open and close are the same characters
+
+    Returns:
+        Tuple[int]: If matching delimeters are found, return indices in `list`. Otherwise, return end < start.
+    """
+    if len(open) != 1 or len(close) != 1:
+        raise RuntimeError("`open` and `close` must be single characters")
+    if open == close:
+        raise RuntimeError("`open` and `close` must be different characters")
+    start: int = line.find(open)
+    if start < 0:
+        return -1, -1
+    count: int = 1
+    for i in range(start + 1, len(line)):
+        if line[i] == open:
+            count += 1
+        if line[i] == close:
+            count -= 1
+            if count == 0:
+                return start, i
+    return start, -1
+
+
 def parse_test_line(line: str) -> TestSpec:
     """Parse a single line of TESTARGS and CLI arguments into a `TestSpec` object
 
@@ -228,18 +299,58 @@ def parse_test_line(line: str) -> TestSpec:
     Returns:
         TestSpec: Parsed specification of test case
     """
-    args: List[str] = re.findall("(?:\".*?\"|\\S)+", line.strip())
-    if args[0] == 'TESTARGS':
-        return TestSpec(name='', args=args[1:])
-    raw_test_args: str = args[0][args[0].index('TESTARGS(') + 9:args[0].rindex(')')]
-    # transform 'name="myname",only="serial,int32"' into {'name': 'myname', 'only': 'serial,int32'}
-    test_args: dict = dict([''.join(t).split('=') for t in re.findall(r"""([^,=]+)(=)"([^"]*)\"""", raw_test_args)])
-    name: str = test_args.get('name', '')
-    constraints: List[str] = test_args['only'].split(',') if 'only' in test_args else []
-    if len(args) > 1:
-        return TestSpec(name=name, only=constraints, args=args[1:])
-    else:
-        return TestSpec(name=name, only=constraints)
+    test_fields = fields(TestSpec)
+    field_names = [f.name for f in test_fields]
+    known: Dict = dict()
+    other: Dict = dict()
+    if line[0] == "(":
+        # have key/value pairs to parse
+        start, end = find_matching(line)
+        if end < start:
+            raise ParseError(f"Mismatched parentheses in TESTCASE: {line}")
+
+        keyvalues_str = line[start:end + 1]
+        keyvalues_pattern = re.compile(r'''
+            (?:\(\s*|\s*,\s*)   # start with open parentheses or comma, no capture
+            ([A-Za-z]+[\w\-]+)  # match key starting with alpha, containing alphanumeric, _, or -; captured as Group 1
+            \s*=\s*             # key is followed by = (whitespace ignored)
+            (?:                 # uncaptured group for OR
+              "((?:[^"]|\\")+)" #   match quoted value (any internal " must be escaped as \"); captured as Group 2
+            | ([^=]+)           #   OR match unquoted value (no equals signs allowed); captured as Group 3
+            )                   # end uncaptured group for OR
+            \s*(?=,|\))         # lookahead for either next comma or closing parentheses
+        ''', re.VERBOSE)
+
+        for match in re.finditer(keyvalues_pattern, keyvalues_str):
+            if not match:  # empty
+                continue
+            key = match.group(1)
+            value = match.group(2) if match.group(2) else match.group(3)
+            try:
+                index = field_names.index(key)
+                if key == "only":  # weird bc only is a list
+                    value = [constraint.strip() for constraint in value.split(',')]
+                try:
+                    # TODO: stop supporting python <=3.8
+                    known[key] = test_fields[index].type(value)  # type: ignore
+                except TypeError:
+                    # TODO: this is still liable to fail for complex types
+                    known[key] = get_origin(test_fields[index].type)(value)  # type: ignore
+            except ValueError:
+                other[key] = value
+
+        line = line[end + 1:]
+
+    args_pattern = re.compile(r'''
+        \s+(            # remove leading space
+            (?:"[^"]+") # match quoted CLI option
+          | (?:[\S]+)   # match anything else that is space separated
+        )
+    ''', re.VERBOSE)
+    args: List[str] = re.findall(args_pattern, line)
+    for k, v in other.items():
+        print(f"warning, unknown TESTCASE option for test '{known['name']}': {k}={v}")
+    return TestSpec(**known, key_values=other, args=args)
 
 
 def get_test_args(source_file: Path) -> List[TestSpec]:
@@ -266,20 +377,20 @@ def get_test_args(source_file: Path) -> List[TestSpec]:
     else:
         raise RuntimeError(f'Unrecognized extension for file: {source_file}')
 
-    return [parse_test_line(line.strip(comment_str))
+    return [parse_test_line(line.strip(comment_str).removeprefix("TESTARGS"))
             for line in source_file.read_text().splitlines()
             if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec('', args=['{ceed_resource}'])]
 
 
-def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: float = 1e-2,
+def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float, rel_tol: float,
              comment_str: str = '#', comment_func: Optional[Callable[[str, str], Optional[str]]] = None) -> str:
     """Compare CSV results against an expected CSV file with tolerances
 
     Args:
         test_csv (Path): Path to output CSV results
         true_csv (Path): Path to expected CSV results
-        zero_tol (float, optional): Tolerance below which values are considered to be zero. Defaults to 3e-10.
-        rel_tol (float, optional): Relative tolerance for comparing non-zero values. Defaults to 1e-2.
+        zero_tol (float): Tolerance below which values are considered to be zero.
+        rel_tol (float): Relative tolerance for comparing non-zero values.
         comment_str (str, optional): String to denoting commented line
         comment_func (Callable, optional): Function to determine if test and true line are different
 
@@ -317,6 +428,10 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f
 
     test_reader: csv.DictReader = csv.DictReader(test_lines)
     true_reader: csv.DictReader = csv.DictReader(true_lines)
+    if not test_reader.fieldnames:
+        return f'No CSV columns found in test output {test_csv}'
+    if not true_reader.fieldnames:
+        return f'No CSV columns found in test source {true_csv}'
     if test_reader.fieldnames != true_reader.fieldnames:
         return ''.join(difflib.unified_diff([f'{test_lines[0]}\n'], [f'{true_lines[0]}\n'],
                        tofile='found CSV columns', fromfile='expected CSV columns'))
@@ -344,13 +459,13 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f
     return '\n'.join(diff_lines)
 
 
-def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float = 1e-12) -> str:
+def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float) -> str:
     """Compare CGNS results against an expected CGSN file with tolerance
 
     Args:
         test_cgns (Path): Path to output CGNS file
         true_cgns (Path): Path to expected CGNS file
-        cgns_tol (float, optional): Tolerance for comparing floating-point values
+        cgns_tol (float): Tolerance for comparing floating-point values
 
     Returns:
         str: Diff output between result and expected CGNS files
@@ -367,32 +482,60 @@ def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float = 1e-12) -> str:
     return proc.stderr.decode('utf-8') + proc.stdout.decode('utf-8')
 
 
+def diff_ascii(test_file: Path, true_file: Path, backend: str) -> str:
+    """Compare ASCII results against an expected ASCII file
+
+    Args:
+        test_file (Path): Path to output ASCII file
+        true_file (Path): Path to expected ASCII file
+
+    Returns:
+        str: Diff output between result and expected ASCII files
+    """
+    tmp_backend: str = backend.replace('/', '-')
+    true_str: str = true_file.read_text().replace('{ceed_resource}', tmp_backend)
+    diff = list(difflib.unified_diff(test_file.read_text().splitlines(keepends=True),
+                                     true_str.splitlines(keepends=True),
+                                     fromfile=str(test_file),
+                                     tofile=str(true_file)))
+    return ''.join(diff)
+
+
 def test_case_output_string(test_case: TestCase, spec: TestSpec, mode: RunMode,
-                            backend: str, test: str, index: int) -> str:
+                            backend: str, test: str, index: int, verbose: bool) -> str:
     output_str = ''
     if mode is RunMode.TAP:
         # print incremental output if TAP mode
         if test_case.is_skipped():
             output_str += f'    ok {index} - {spec.name}, {backend} # SKIP {test_case.skipped[0]["message"]}\n'
         elif test_case.is_failure() or test_case.is_error():
-            output_str += f'    not ok {index} - {spec.name}, {backend}\n'
+            output_str += f'    not ok {index} - {spec.name}, {backend} ({test_case.elapsed_sec} s)\n'
         else:
-            output_str += f'    ok {index} - {spec.name}, {backend}\n'
-        output_str += f'      ---\n'
-        if spec.only:
-            output_str += f'      only: {",".join(spec.only)}\n'
-        output_str += f'      args: {test_case.args}\n'
-        if test_case.is_error():
-            output_str += f'      error: {test_case.errors[0]["message"]}\n'
-        if test_case.is_failure():
-            output_str += f'      num_failures: {len(test_case.failures)}\n'
-            for i, failure in enumerate(test_case.failures):
-                output_str += f'      failure_{i}: {failure["message"]}\n'
-                output_str += f'        message: {failure["message"]}\n'
-                if failure["output"]:
-                    out = failure["output"].strip().replace('\n', '\n          ')
-                    output_str += f'        output: |\n          {out}\n'
-        output_str += f'      ...\n'
+            output_str += f'    ok {index} - {spec.name}, {backend} ({test_case.elapsed_sec} s)\n'
+        if test_case.is_failure() or test_case.is_error() or verbose:
+            output_str += f'      ---\n'
+            if spec.only:
+                output_str += f'      only: {",".join(spec.only)}\n'
+            output_str += f'      args: {test_case.args}\n'
+            if spec.csv_ztol > 0:
+                output_str += f'      csv_ztol: {spec.csv_ztol}\n'
+            if spec.csv_rtol > 0:
+                output_str += f'      csv_rtol: {spec.csv_rtol}\n'
+            if spec.cgns_tol > 0:
+                output_str += f'      cgns_tol: {spec.cgns_tol}\n'
+            for k, v in spec.key_values.items():
+                output_str += f'      {k}: {v}\n'
+            if test_case.is_error():
+                output_str += f'      error: {test_case.errors[0]["message"]}\n'
+            if test_case.is_failure():
+                output_str += f'      failures:\n'
+                for i, failure in enumerate(test_case.failures):
+                    output_str += f'        -\n'
+                    output_str += f'          message: {failure["message"]}\n'
+                    if failure["output"]:
+                        out = failure["output"].strip().replace('\n', '\n            ')
+                        output_str += f'          output: |\n            {out}\n'
+            output_str += f'      ...\n'
     else:
         # print error or failure information if JUNIT mode
         if test_case.is_error() or test_case.is_failure():
@@ -408,8 +551,20 @@ def test_case_output_string(test_case: TestCase, spec: TestSpec, mode: RunMode,
     return output_str
 
 
+def save_failure_artifact(suite_spec: SuiteSpec, file: Path) -> Path:
+    """Attach a file to a test case
+
+    Args:
+        test_case (TestCase): Test case to attach the file to
+        file (Path): Path to the file to attach
+    """
+    save_path: Path = suite_spec.test_failure_artifacts_path / file.name
+    shutil.copyfile(file, save_path)
+    return save_path
+
+
 def run_test(index: int, test: str, spec: TestSpec, backend: str,
-             mode: RunMode, nproc: int, suite_spec: SuiteSpec) -> TestCase:
+             mode: RunMode, nproc: int, suite_spec: SuiteSpec, verbose: bool = False) -> TestCase:
     """Run a single test case and backend combination
 
     Args:
@@ -420,6 +575,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
         mode (RunMode): Output mode
         nproc (int): Number of MPI processes to use when running test case
         suite_spec (SuiteSpec): Specification of test suite
+        verbose (bool, optional): Print detailed output for all runs, not just failures. Defaults to False.
 
     Returns:
         TestCase: Test case result
@@ -438,7 +594,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
         run_args = ['mpiexec', '-n', f'{nproc}', *run_args]
 
     # run test
-    skip_reason: str = suite_spec.check_pre_skip(test, spec, backend, nproc)
+    skip_reason: Optional[str] = suite_spec.check_pre_skip(test, spec, backend, nproc)
     if skip_reason:
         test_case: TestCase = TestCase(f'{test}, "{spec.name}", n{nproc}, {backend}',
                                        elapsed_sec=0,
@@ -464,19 +620,23 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
                              allow_multiple_subelements=True,
                              category=spec.name,)
         ref_csvs: List[Path] = []
+        ref_ascii: List[Path] = []
         output_files: List[str] = [arg for arg in run_args if 'ascii:' in arg]
         if output_files:
-            ref_csvs = [suite_spec.get_output_path(test, file.split(':')[1]) for file in output_files]
+            ref_csvs = [suite_spec.get_output_path(test, file.split(':')[1])
+                        for file in output_files if file.endswith('.csv')]
+            ref_ascii = [suite_spec.get_output_path(test, file.split(':')[1])
+                         for file in output_files if not file.endswith('.csv')]
         ref_cgns: List[Path] = []
         output_files = [arg for arg in run_args if 'cgns:' in arg]
         if output_files:
             ref_cgns = [suite_spec.get_output_path(test, file.split('cgns:')[-1]) for file in output_files]
         ref_stdout: Path = suite_spec.get_output_path(test, test + '.out')
-        suite_spec.post_test_hook(test, spec)
+        suite_spec.post_test_hook(test, spec, backend)
 
     # check allowed failures
     if not test_case.is_skipped() and test_case.stderr:
-        skip_reason: str = suite_spec.check_post_skip(test, spec, backend, test_case.stderr)
+        skip_reason: Optional[str] = suite_spec.check_post_skip(test, spec, backend, test_case.stderr)
         if skip_reason:
             test_case.add_skipped_info(skip_reason)
 
@@ -507,39 +667,67 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
         # expected CSV output
         for ref_csv in ref_csvs:
             csv_name = ref_csv.name
+            out_file = Path.cwd() / csv_name
             if not ref_csv.is_file():
                 # remove _{ceed_backend} from path name
                 ref_csv = (ref_csv.parent / ref_csv.name.rsplit('_', 1)[0]).with_suffix('.csv')
             if not ref_csv.is_file():
                 test_case.add_failure_info('csv', output=f'{ref_csv} not found')
-            elif not (Path.cwd() / csv_name).is_file():
-                test_case.add_failure_info('csv', output=f'{csv_name} not found')
+            elif not out_file.is_file():
+                test_case.add_failure_info('csv', output=f'{out_file} not found')
             else:
-                diff: str = diff_csv(Path.cwd() / csv_name, ref_csv, **suite_spec.diff_csv_kwargs)
+                csv_ztol: float = spec.csv_ztol if spec.csv_ztol > 0 else suite_spec.csv_ztol
+                csv_rtol: float = spec.csv_rtol if spec.csv_rtol > 0 else suite_spec.csv_rtol
+                diff = diff_csv(out_file, ref_csv, zero_tol=csv_ztol, rel_tol=csv_rtol)
                 if diff:
-                    test_case.add_failure_info('csv', output=diff)
+                    save_path: Path = suite_spec.test_failure_artifacts_path / csv_name
+                    shutil.move(out_file, save_path)
+                    test_case.add_failure_info(f'csv: {save_path}', output=diff)
                 else:
-                    (Path.cwd() / csv_name).unlink()
+                    out_file.unlink()
         # expected CGNS output
         for ref_cgn in ref_cgns:
             cgn_name = ref_cgn.name
+            out_file = Path.cwd() / cgn_name
             if not ref_cgn.is_file():
                 # remove _{ceed_backend} from path name
                 ref_cgn = (ref_cgn.parent / ref_cgn.name.rsplit('_', 1)[0]).with_suffix('.cgns')
             if not ref_cgn.is_file():
                 test_case.add_failure_info('cgns', output=f'{ref_cgn} not found')
-            elif not (Path.cwd() / cgn_name).is_file():
-                test_case.add_failure_info('csv', output=f'{cgn_name} not found')
+            elif not out_file.is_file():
+                test_case.add_failure_info('cgns', output=f'{out_file} not found')
+            else:
+                cgns_tol = spec.cgns_tol if spec.cgns_tol > 0 else suite_spec.cgns_tol
+                diff = diff_cgns(out_file, ref_cgn, cgns_tol=cgns_tol)
+                if diff:
+                    save_path: Path = suite_spec.test_failure_artifacts_path / cgn_name
+                    shutil.move(out_file, save_path)
+                    test_case.add_failure_info(f'cgns: {save_path}', output=diff)
+                else:
+                    out_file.unlink()
+        # expected ASCII output
+        for ref_file in ref_ascii:
+            ref_name = ref_file.name
+            out_file = Path.cwd() / ref_name
+            if not ref_file.is_file():
+                # remove _{ceed_backend} from path name
+                ref_file = (ref_file.parent / ref_file.name.rsplit('_', 1)[0]).with_suffix(ref_file.suffix)
+            if not ref_file.is_file():
+                test_case.add_failure_info('ascii', output=f'{ref_file} not found')
+            elif not out_file.is_file():
+                test_case.add_failure_info('ascii', output=f'{out_file} not found')
             else:
-                diff = diff_cgns(Path.cwd() / cgn_name, ref_cgn, cgns_tol=suite_spec.cgns_tol)
+                diff = diff_ascii(out_file, ref_file, backend)
                 if diff:
-                    test_case.add_failure_info('cgns', output=diff)
+                    save_path: Path = suite_spec.test_failure_artifacts_path / ref_name
+                    shutil.move(out_file, save_path)
+                    test_case.add_failure_info(f'ascii: {save_path}', output=diff)
                 else:
-                    (Path.cwd() / cgn_name).unlink()
+                    out_file.unlink()
 
     # store result
     test_case.args = ' '.join(str(arg) for arg in run_args)
-    output_str = test_case_output_string(test_case, spec, mode, backend, test, index)
+    output_str = test_case_output_string(test_case, spec, mode, backend, test, index, verbose)
 
     return test_case, output_str
 
@@ -553,7 +741,7 @@ def init_process():
 
 
 def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int,
-              suite_spec: SuiteSpec, pool_size: int = 1) -> TestSuite:
+              suite_spec: SuiteSpec, pool_size: int = 1, search: str = ".*", verbose: bool = False) -> TestSuite:
     """Run all test cases for `test` with each of the provided `ceed_backends`
 
     Args:
@@ -563,18 +751,23 @@ def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int,
         nproc (int): Number of MPI processes to use when running each test case
         suite_spec (SuiteSpec): Object defining required methods for running tests
         pool_size (int, optional): Number of processes to use when running tests in parallel. Defaults to 1.
+        search (str, optional): Regular expression used to match tests. Defaults to ".*".
+        verbose (bool, optional): Print detailed output for all runs, not just failures. Defaults to False.
 
     Returns:
         TestSuite: JUnit `TestSuite` containing results of all test cases
     """
-    test_specs: List[TestSpec] = get_test_args(suite_spec.get_source_path(test))
+    test_specs: List[TestSpec] = [
+        t for t in get_test_args(suite_spec.get_source_path(test)) if re.search(search, t.name, re.IGNORECASE)
+    ]
+    suite_spec.test_failure_artifacts_path.mkdir(parents=True, exist_ok=True)
     if mode is RunMode.TAP:
         print('TAP version 13')
         print(f'1..{len(test_specs)}')
 
     with mp.Pool(processes=pool_size, initializer=init_process) as pool:
-        async_outputs: List[List[mp.AsyncResult]] = [
-            [pool.apply_async(run_test, (i, test, spec, backend, mode, nproc, suite_spec))
+        async_outputs: List[List[mp.pool.AsyncResult]] = [
+            [pool.apply_async(run_test, (i, test, spec, backend, mode, nproc, suite_spec, verbose))
              for (i, backend) in enumerate(ceed_backends, start=1)]
             for spec in test_specs
         ]
@@ -607,7 +800,7 @@ def write_junit_xml(test_suite: TestSuite, output_file: Optional[Path], batch: s
         output_file (Optional[Path]): Path to output file, or `None` to generate automatically as `build/{test_suite.name}{batch}.junit`
         batch (str): Name of JUnit batch, defaults to empty string
     """
-    output_file: Path = output_file or Path('build') / (f'{test_suite.name}{batch}.junit')
+    output_file = output_file or Path('build') / (f'{test_suite.name}{batch}.junit')
     output_file.write_text(to_xml_report_string([test_suite]))
 
 
From 2c835e8042c5235890e856ae02c8e6009c87f12c Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 18 Jul 2025 14:09:37 -0600
Subject: [PATCH 462/571] ci - bump vermin to Python 3.8

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 3d22b24d01..5331ca3dc5 100644
--- a/Makefile
+++ b/Makefile
@@ -950,7 +950,7 @@ format    : format-c format-py format-ot
 
 # Vermin - python version requirements
 VERMIN            ?= vermin
-VERMIN_OPTS       += -t=3.7- --violations
+VERMIN_OPTS       += -t=3.8- --violations
 
 vermin    :
 	$(VERMIN) $(VERMIN_OPTS) $(format.py)

From 39472f1800a7201e795b8e67cf33eaec17608dee Mon Sep 17 00:00:00 2001
From: Peter Munch <peterrmuench@gmail.com>
Date: Sat, 19 Jul 2025 13:03:28 +0200
Subject: [PATCH 463/571] deal.II examples: some clean up

---
 examples/deal.II/bps.h | 62 ++++++++++++++----------------------------
 1 file changed, 20 insertions(+), 42 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 2068fcce95..1d984999f9 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -276,18 +276,18 @@ class OperatorCeed : public OperatorBase<Number>
     // 4) create mapping -> MappingInfo
     const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
 
-    this->weights = compute_metric_data(ceed, mapping, tria, quadrature, bp);
+    this->metric_data = compute_metric_data(ceed, mapping, tria, quadrature, bp);
 
     strides = {{1,
                 static_cast<int>(quadrature.size()),
                 static_cast<int>(quadrature.size() * n_components_metric)}};
-    CeedVectorCreate(ceed, weights.size(), &q_data);
-    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, weights.data());
+    CeedVectorCreate(ceed, metric_data.size(), &q_data);
+    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data.data());
     CeedElemRestrictionCreateStrided(ceed,
                                      n_local_active_cells,
                                      quadrature.size(),
                                      n_components_metric,
-                                     weights.size(),
+                                     metric_data.size(),
                                      strides.data(),
                                      &q_data_restriction);
 
@@ -402,6 +402,9 @@ class OperatorCeed : public OperatorBase<Number>
 
     diagonal.compress(VectorOperation::add);
 
+    // apply constraints: we assume homogeneous DBC
+    constraints.set_zero(diagonal);
+
     for (auto &i : diagonal)
       i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
   }
@@ -502,24 +505,7 @@ class OperatorCeed : public OperatorBase<Number>
                       const Quadrature<dim>    &quadrature,
                       const BPType              bp)
   {
-    std::vector<double> weights;
-
-    if (false)
-      {
-        FE_Nothing<dim> dummy_fe;
-        FEValues<dim>   fe_values(mapping, dummy_fe, quadrature, update_JxW_values);
-
-        for (const auto &cell : tria.active_cell_iterators())
-          if (cell->is_locally_owned())
-            {
-              fe_values.reinit(cell);
-
-              for (const auto q : fe_values.quadrature_point_indices())
-                weights.emplace_back(fe_values.JxW(q));
-            }
-
-        return weights;
-      }
+    std::vector<double> metric_data;
 
     CeedBasis            geo_basis;
     CeedVector           q_data;
@@ -532,7 +518,7 @@ class OperatorCeed : public OperatorBase<Number>
 
     const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
 
-    const unsigned int n_components = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
+    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
 
     const auto mapping_q = dynamic_cast<const MappingQ<dim> *>(&mapping);
 
@@ -625,19 +611,19 @@ class OperatorCeed : public OperatorBase<Number>
             }
         }
 
-    weights.resize(n_local_active_cells * quadrature.size() * n_components);
+    metric_data.resize(n_local_active_cells * quadrature.size() * n_components_metric);
 
     CeedInt strides[3] = {1,
                           static_cast<int>(quadrature.size()),
-                          static_cast<int>(quadrature.size() * n_components)};
+                          static_cast<int>(quadrature.size() * n_components_metric)};
 
-    CeedVectorCreate(ceed, weights.size(), &q_data);
-    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, weights.data());
+    CeedVectorCreate(ceed, metric_data.size(), &q_data);
+    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data.data());
     CeedElemRestrictionCreateStrided(ceed,
                                      n_local_active_cells,
                                      quadrature.size(),
-                                     n_components,
-                                     weights.size(),
+                                     n_components_metric,
+                                     metric_data.size(),
                                      strides,
                                      &q_data_restriction);
 
@@ -670,15 +656,15 @@ class OperatorCeed : public OperatorBase<Number>
       CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build);
 
     CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD);
-    CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT);
-    CeedQFunctionAddOutput(qf_build, "qdata", n_components, CEED_EVAL_NONE);
+    CeedQFunctionAddInput(qf_build, "metric_data", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_build, "qdata", n_components_metric, CEED_EVAL_NONE);
     CeedQFunctionSetContext(qf_build, build_ctx);
 
     // 6) put everything together
     CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
     CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE);
     CeedOperatorSetField(
-      op_build, "weights", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
+      op_build, "metric_data", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
     CeedOperatorSetField(
       op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
 
@@ -694,7 +680,7 @@ class OperatorCeed : public OperatorBase<Number>
     CeedQFunctionDestroy(&qf_build);
     CeedOperatorDestroy(&op_build);
 
-    return weights;
+    return metric_data;
   }
 
   /**
@@ -736,19 +722,11 @@ class OperatorCeed : public OperatorBase<Number>
    * libCEED data structures.
    */
   Ceed                   ceed;
-  std::vector<double>    weights;
+  std::vector<double>    metric_data;
   std::array<CeedInt, 3> strides;
   CeedVector             src_ceed;
   CeedVector             dst_ceed;
   CeedOperator           op_apply;
-
-  /**
-   * Temporal (tempral) vectors.
-   *
-   * @note Only needed for multiple components.
-   */
-  mutable VectorType src_tmp;
-  mutable VectorType dst_tmp;
 };
 
 
From ca62d558e488f6cc806d30b1949b4c18ff816b3a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Jul 2025 12:33:01 -0600
Subject: [PATCH 464/571] basis - pull BasisIsCollocated helper to interface
 level

---
 backends/ref/ceed-ref-basis.c | 12 +-----------
 include/ceed/backend.h        |  1 +
 interface/ceed-basis.c        | 26 ++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index 2ae551eaf2..9b05f064db 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -286,17 +286,7 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
 
   CeedCallBackend(CeedCalloc(1, &impl));
   // Check for collocated interp
-  if (Q_1d == P_1d) {
-    bool has_collocated = true;
-
-    for (CeedInt i = 0; i < P_1d; i++) {
-      has_collocated = has_collocated && (fabs(interp_1d[i + P_1d * i] - 1.0) < 1e-14);
-      for (CeedInt j = 0; j < P_1d; j++) {
-        if (j != i) has_collocated = has_collocated && (fabs(interp_1d[j + P_1d * i]) < 1e-14);
-      }
-    }
-    impl->has_collo_interp = has_collocated;
-  }
+  CeedCallBackend(CeedBasisIsCollocated(basis, &impl->has_collo_interp));
   // Calculate collocated grad
   if (Q_1d >= P_1d && !impl->has_collo_interp) {
     CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &impl->collo_grad_1d));
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index b4b5a980c7..c48e0d4666 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -331,6 +331,7 @@ CEED_EXTERN const char *const CeedFESpaces[];
 CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *colo_grad_1d);
 CEED_EXTERN int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d);
 CEED_EXTERN int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor);
+CEED_EXTERN int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated);
 CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisSetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisReference(CeedBasis basis);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 4b798ab91a..e4494aae91 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -780,6 +780,32 @@ int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Determine if given `CeedBasis` has nodes collocated with quadrature points
+
+  @param[in]  basis     `CeedBasis`
+  @param[out] is_tensor Variable to store collocated status
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated) {
+  if (basis->is_tensor_basis && (basis->Q_1d == basis->P_1d)) {
+    *is_collocated = true;
+
+    for (CeedInt i = 0; i < basis->P_1d; i++) {
+      *is_collocated = *is_collocated && (fabs(basis->interp_1d[i + basis->P_1d * i] - 1.0) < 10 * CEED_EPSILON);
+      for (CeedInt j = 0; j < basis->Q_1d; j++) {
+        if (j != i) *is_collocated = *is_collocated && (fabs(basis->interp_1d[j + basis->P_1d * i]) < 10 * CEED_EPSILON);
+      }
+    }
+  } else {
+    *is_collocated = false;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get backend data of a `CeedBasis`
 

From aa4b4a9fcfd0e85f86607a23b052958a8bf51783 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 18 Jul 2025 12:54:26 -0600
Subject: [PATCH 465/571] ref - minor variable name clarity

---
 backends/ref/ceed-ref-basis.c | 9 ++++-----
 backends/ref/ceed-ref.h       | 2 +-
 interface/ceed-basis.c        | 4 ++--
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index 9b05f064db..15efd97e36 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -55,7 +55,7 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
     switch (eval_mode) {
       // Interpolate to/from quadrature points
       case CEED_EVAL_INTERP: {
-        if (impl->has_collo_interp) {
+        if (impl->is_collocated) {
           memcpy(v, u, num_elem * num_comp * num_nodes * sizeof(u[0]));
         } else {
           CeedInt P = P_1d, Q = Q_1d;
@@ -124,7 +124,7 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
             pre /= P;
             post *= Q;
           }
-        } else if (impl->has_collo_interp) {  // Qpts collocated with nodes
+        } else if (impl->is_collocated) {  // Qpts collocated with nodes
           const CeedScalar *grad_1d;
 
           CeedCallBackend(CeedBasisGetGrad1D(basis, &grad_1d));
@@ -285,10 +285,9 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
 
   CeedCallBackend(CeedCalloc(1, &impl));
-  // Check for collocated interp
-  CeedCallBackend(CeedBasisIsCollocated(basis, &impl->has_collo_interp));
   // Calculate collocated grad
-  if (Q_1d >= P_1d && !impl->has_collo_interp) {
+  CeedCallBackend(CeedBasisIsCollocated(basis, &impl->is_collocated));
+  if (Q_1d >= P_1d && !impl->is_collocated) {
     CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &impl->collo_grad_1d));
     CeedCallBackend(CeedBasisGetCollocatedGrad(basis, impl->collo_grad_1d));
   }
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index 621424b2ed..8396fc1a69 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -33,7 +33,7 @@ typedef struct {
 
 typedef struct {
   CeedScalar *collo_grad_1d;
-  bool        has_collo_interp;
+  bool        is_collocated;
 } CeedBasis_Ref;
 
 typedef struct {
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index e4494aae91..6dfbd55d61 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -783,8 +783,8 @@ int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) {
 /**
   @brief Determine if given `CeedBasis` has nodes collocated with quadrature points
 
-  @param[in]  basis     `CeedBasis`
-  @param[out] is_tensor Variable to store collocated status
+  @param[in]  basis         `CeedBasis`
+  @param[out] is_collocated Variable to store collocated status
 
   @return An error code: 0 - success, otherwise - failure
 

From 7b1ec8807c067184328a454e68b003059cbc1258 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Mon, 21 Jul 2025 09:27:53 -0600
Subject: [PATCH 466/571] ci - add default test name (file name)

---
 tests/junit_common.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/junit_common.py b/tests/junit_common.py
index 42461b132c..f8f4a4d556 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -290,7 +290,7 @@ def find_matching(line: str, open: str = '(', close: str = ')') -> Tuple[int, in
     return start, -1
 
 
-def parse_test_line(line: str) -> TestSpec:
+def parse_test_line(line: str, fallback_name: str = '') -> TestSpec:
     """Parse a single line of TESTARGS and CLI arguments into a `TestSpec` object
 
     Args:
@@ -341,6 +341,9 @@ def parse_test_line(line: str) -> TestSpec:
 
         line = line[end + 1:]
 
+    if not 'name' in known.keys():
+        known['name'] = fallback_name
+
     args_pattern = re.compile(r'''
         \s+(            # remove leading space
             (?:"[^"]+") # match quoted CLI option
@@ -377,9 +380,9 @@ def get_test_args(source_file: Path) -> List[TestSpec]:
     else:
         raise RuntimeError(f'Unrecognized extension for file: {source_file}')
 
-    return [parse_test_line(line.strip(comment_str).removeprefix("TESTARGS"))
+    return [parse_test_line(line.strip(comment_str).removeprefix("TESTARGS"), source_file.stem)
             for line in source_file.read_text().splitlines()
-            if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec('', args=['{ceed_resource}'])]
+            if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec(source_file.stem, args=['{ceed_resource}'])]
 
 
 def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float, rel_tol: float,

From eb3489a0858d8ed90c2b4b27f8469c6b0b9f1b88 Mon Sep 17 00:00:00 2001
From: Peter Munch <peterrmuench@gmail.com>
Date: Mon, 21 Jul 2025 21:34:39 +0200
Subject: [PATCH 467/571] Rename

---
 examples/deal.II/bps.h | 59 +++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 1d984999f9..bcb7899f68 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -174,10 +174,10 @@ class OperatorCeed : public OperatorBase<Number>
   void
   reinit() override
   {
-    CeedVector           q_data;
+    CeedVector           metric_data;
     CeedBasis            sol_basis;
     CeedElemRestriction  sol_restriction;
-    CeedElemRestriction  q_data_restriction;
+    CeedElemRestriction  metric_data_restriction;
     BuildContext         build_ctx_data;
     CeedQFunctionContext build_ctx;
     CeedQFunction        qf_apply;
@@ -276,20 +276,20 @@ class OperatorCeed : public OperatorBase<Number>
     // 4) create mapping -> MappingInfo
     const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
 
-    this->metric_data = compute_metric_data(ceed, mapping, tria, quadrature, bp);
+    metric_data_raw = compute_metric_data(ceed, mapping, tria, quadrature, bp);
 
     strides = {{1,
                 static_cast<int>(quadrature.size()),
                 static_cast<int>(quadrature.size() * n_components_metric)}};
-    CeedVectorCreate(ceed, metric_data.size(), &q_data);
-    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data.data());
+    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
+    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
     CeedElemRestrictionCreateStrided(ceed,
                                      n_local_active_cells,
                                      quadrature.size(),
                                      n_components_metric,
-                                     metric_data.size(),
+                                     metric_data_raw.size(),
                                      strides.data(),
-                                     &q_data_restriction);
+                                     &metric_data_restriction);
 
     build_ctx_data.dim       = dim;
     build_ctx_data.space_dim = dim;
@@ -315,7 +315,7 @@ class OperatorCeed : public OperatorBase<Number>
     else
       CeedQFunctionAddInput(qf_apply, "u", dim * n_components, CEED_EVAL_GRAD);
 
-    CeedQFunctionAddInput(qf_apply, "qdata", n_components_metric, CEED_EVAL_NONE);
+    CeedQFunctionAddInput(qf_apply, "metric data", n_components_metric, CEED_EVAL_NONE);
 
     if (bp <= BPType::BP2)
       CeedQFunctionAddOutput(qf_apply, "v", n_components, CEED_EVAL_INTERP);
@@ -328,7 +328,8 @@ class OperatorCeed : public OperatorBase<Number>
     CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
 
     CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
-    CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
+    CeedOperatorSetField(
+      op_apply, "metric data", metric_data_restriction, CEED_BASIS_NONE, metric_data);
     CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
 
     // 7) libCEED vectors
@@ -336,8 +337,8 @@ class OperatorCeed : public OperatorBase<Number>
     CeedElemRestrictionCreateVector(sol_restriction, &dst_ceed, NULL);
 
     // 8) cleanup
-    CeedVectorDestroy(&q_data);
-    CeedElemRestrictionDestroy(&q_data_restriction);
+    CeedVectorDestroy(&metric_data);
+    CeedElemRestrictionDestroy(&metric_data_restriction);
     CeedElemRestrictionDestroy(&sol_restriction);
     CeedBasisDestroy(&sol_basis);
     CeedQFunctionContextDestroy(&build_ctx);
@@ -505,11 +506,11 @@ class OperatorCeed : public OperatorBase<Number>
                       const Quadrature<dim>    &quadrature,
                       const BPType              bp)
   {
-    std::vector<double> metric_data;
+    std::vector<double> metric_data_raw;
 
     CeedBasis            geo_basis;
-    CeedVector           q_data;
-    CeedElemRestriction  q_data_restriction;
+    CeedVector           metric_data;
+    CeedElemRestriction  metric_data_restriction;
     CeedVector           node_coords;
     CeedElemRestriction  geo_restriction;
     CeedQFunctionContext build_ctx;
@@ -611,21 +612,21 @@ class OperatorCeed : public OperatorBase<Number>
             }
         }
 
-    metric_data.resize(n_local_active_cells * quadrature.size() * n_components_metric);
+    metric_data_raw.resize(n_local_active_cells * quadrature.size() * n_components_metric);
 
     CeedInt strides[3] = {1,
                           static_cast<int>(quadrature.size()),
                           static_cast<int>(quadrature.size() * n_components_metric)};
 
-    CeedVectorCreate(ceed, metric_data.size(), &q_data);
-    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data.data());
+    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
+    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
     CeedElemRestrictionCreateStrided(ceed,
                                      n_local_active_cells,
                                      quadrature.size(),
                                      n_components_metric,
-                                     metric_data.size(),
+                                     metric_data_raw.size(),
                                      strides,
-                                     &q_data_restriction);
+                                     &metric_data_restriction);
 
     CeedVectorCreate(ceed, geo_support_points.size(), &node_coords);
     CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, geo_support_points.data());
@@ -656,31 +657,31 @@ class OperatorCeed : public OperatorBase<Number>
       CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build);
 
     CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD);
-    CeedQFunctionAddInput(qf_build, "metric_data", 1, CEED_EVAL_WEIGHT);
-    CeedQFunctionAddOutput(qf_build, "qdata", n_components_metric, CEED_EVAL_NONE);
+    CeedQFunctionAddInput(qf_build, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_build, "metric data", n_components_metric, CEED_EVAL_NONE);
     CeedQFunctionSetContext(qf_build, build_ctx);
 
     // 6) put everything together
     CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
     CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE);
     CeedOperatorSetField(
-      op_build, "metric_data", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
+      op_build, "weight", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
     CeedOperatorSetField(
-      op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+      op_build, "metric data", metric_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
 
-    CeedOperatorApply(op_build, node_coords, q_data, CEED_REQUEST_IMMEDIATE);
+    CeedOperatorApply(op_build, node_coords, metric_data, CEED_REQUEST_IMMEDIATE);
 
     CeedVectorDestroy(&node_coords);
-    CeedVectorSyncArray(q_data, CEED_MEM_HOST);
-    CeedVectorDestroy(&q_data);
+    CeedVectorSyncArray(metric_data, CEED_MEM_HOST);
+    CeedVectorDestroy(&metric_data);
     CeedElemRestrictionDestroy(&geo_restriction);
-    CeedElemRestrictionDestroy(&q_data_restriction);
+    CeedElemRestrictionDestroy(&metric_data_restriction);
     CeedBasisDestroy(&geo_basis);
     CeedQFunctionContextDestroy(&build_ctx);
     CeedQFunctionDestroy(&qf_build);
     CeedOperatorDestroy(&op_build);
 
-    return metric_data;
+    return metric_data_raw;
   }
 
   /**
@@ -722,7 +723,7 @@ class OperatorCeed : public OperatorBase<Number>
    * libCEED data structures.
    */
   Ceed                   ceed;
-  std::vector<double>    metric_data;
+  std::vector<double>    metric_data_raw;
   std::array<CeedInt, 3> strides;
   CeedVector             src_ceed;
   CeedVector             dst_ceed;

From 5f1423ff9910874ab167693e74826c4f9b91328c Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Mon, 21 Jul 2025 16:23:28 -0600
Subject: [PATCH 468/571] ci - update junit_common with changes from HONEE

---
 tests/junit_common.py | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/tests/junit_common.py b/tests/junit_common.py
index f8f4a4d556..60aa2a136c 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -153,6 +153,22 @@ def csv_rtol(self):
     def csv_rtol(self, val):
         self._csv_rtol = val
 
+    @property
+    def csv_comment_diff_fn(self):  # -> Any | Callable[..., None]:
+        return getattr(self, '_csv_comment_diff_fn', None)
+
+    @csv_comment_diff_fn.setter
+    def csv_comment_diff_fn(self, test_fn):
+        self._csv_comment_diff_fn = test_fn
+
+    @property
+    def csv_comment_str(self):
+        return getattr(self, '_csv_comment_str', '#')
+
+    @csv_comment_str.setter
+    def csv_comment_str(self, comment_str):
+        self._csv_comment_str = comment_str
+
     def post_test_hook(self, test: str, spec: TestSpec, backend: str) -> None:
         """Function callback ran after each test case
 
@@ -624,16 +640,16 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
                              category=spec.name,)
         ref_csvs: List[Path] = []
         ref_ascii: List[Path] = []
-        output_files: List[str] = [arg for arg in run_args if 'ascii:' in arg]
+        output_files: List[str] = [arg.split(':')[1] for arg in run_args if arg.startswith('ascii:')]
         if output_files:
-            ref_csvs = [suite_spec.get_output_path(test, file.split(':')[1])
+            ref_csvs = [suite_spec.get_output_path(test, file)
                         for file in output_files if file.endswith('.csv')]
-            ref_ascii = [suite_spec.get_output_path(test, file.split(':')[1])
+            ref_ascii = [suite_spec.get_output_path(test, file)
                          for file in output_files if not file.endswith('.csv')]
         ref_cgns: List[Path] = []
-        output_files = [arg for arg in run_args if 'cgns:' in arg]
+        output_files = [arg.split(':')[1] for arg in run_args if arg.startswith('cgns:')]
         if output_files:
-            ref_cgns = [suite_spec.get_output_path(test, file.split('cgns:')[-1]) for file in output_files]
+            ref_cgns = [suite_spec.get_output_path(test, file) for file in output_files]
         ref_stdout: Path = suite_spec.get_output_path(test, test + '.out')
         suite_spec.post_test_hook(test, spec, backend)
 
@@ -681,7 +697,13 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
             else:
                 csv_ztol: float = spec.csv_ztol if spec.csv_ztol > 0 else suite_spec.csv_ztol
                 csv_rtol: float = spec.csv_rtol if spec.csv_rtol > 0 else suite_spec.csv_rtol
-                diff = diff_csv(out_file, ref_csv, zero_tol=csv_ztol, rel_tol=csv_rtol)
+                diff = diff_csv(
+                    out_file,
+                    ref_csv,
+                    csv_ztol,
+                    csv_rtol,
+                    suite_spec.csv_comment_str,
+                    suite_spec.csv_comment_diff_fn)
                 if diff:
                     save_path: Path = suite_spec.test_failure_artifacts_path / csv_name
                     shutil.move(out_file, save_path)

From df8ed97b2581bbffe64e3b8ae3776931511bd05a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 23 Jul 2025 11:27:48 -0600
Subject: [PATCH 469/571] dealii - use /cpu/self as default resource to get
 best CPU perf

---
 examples/deal.II/bps.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/deal.II/bps.cc b/examples/deal.II/bps.cc
index 7205d0987d..c059460656 100644
--- a/examples/deal.II/bps.cc
+++ b/examples/deal.II/bps.cc
@@ -61,7 +61,7 @@ struct Parameters
   unsigned int n_global_refinements = 1;
   unsigned int fe_degree            = 2;
   bool         print_timings        = true;
-  std::string  libCEED_resource      = "/cpu/self/avx/blocked";
+  std::string  libCEED_resource      = "/cpu/self";
 
   bool
   parse(int argc, char *argv[])

From 2129291034139a1db9ffe414bf8bc92a8e43e245 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 28 Jul 2025 11:30:05 -0600
Subject: [PATCH 470/571] cuda - collocated nodes/qpts for shared

---
 backends/cuda-shared/ceed-cuda-shared-basis.c |  15 +-
 .../cuda/cuda-shared-basis-tensor-templates.h |  50 +++++
 .../cuda/cuda-shared-basis-tensor.h           | 212 +++++++++++++++++-
 3 files changed, 270 insertions(+), 7 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 5b2fa9256e..a84ccd1410 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -627,18 +627,21 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   }
 
   // Compile basis kernels
+  bool       is_collocated         = false;
   const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n";
 
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
                                    CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                    "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(
+      CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd", &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index f4f701505a..6ccd26f96a 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -235,6 +235,30 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
   }
 }
 
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                   CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                            CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
@@ -521,6 +545,32 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co
   }
 }
 
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                   CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                            CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D quadrature weights
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 1252c8197d..1925488c73 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -13,7 +13,7 @@
 #include "cuda-shared-basis-tensor-templates.h"
 
 //------------------------------------------------------------------------------
-// Interp kernel by dim
+// Interp kernels by dim
 //------------------------------------------------------------------------------
 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -53,6 +53,36 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   }
 }
 
+extern "C" __global__ void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                            CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    }
+  }
+}
+
 extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -92,6 +122,36 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   }
 }
 
+extern "C" __global__ void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                                     CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+}
+
 extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                               CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -131,6 +191,36 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   }
 }
 
+extern "C" __global__ void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                                        CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
@@ -177,6 +267,46 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c
   }
 }
 
+extern "C" __global__ void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                          CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+    }
+  }
+}
+
 extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                          CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -220,6 +350,46 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   }
 }
 
+extern "C" __global__ void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
+                                                   const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                             CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -263,6 +433,46 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   }
 }
 
+extern "C" __global__ void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
+                                                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Weight kernels by dim
 //------------------------------------------------------------------------------

From 02219a082eb38cf2d3edc97fbfe55fa395a4dc99 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 28 Jul 2025 11:44:17 -0600
Subject: [PATCH 471/571] hip - collocated nodes/qpts for shared

---
 backends/hip-shared/ceed-hip-shared-basis.c   |  15 +-
 .../hip/hip-shared-basis-tensor-templates.h   |  50 ++++
 .../jit-source/hip/hip-shared-basis-tensor.h  | 214 ++++++++++++++++++
 3 files changed, 273 insertions(+), 6 deletions(-)

diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index ae1591995f..6201f2aff1 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -692,6 +692,7 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes));
 
   // Compile basis kernels
+  bool       is_collocated         = false;
   const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor.h>\n";
 
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
@@ -699,12 +700,14 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                   "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
                                   data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD",
                                   has_collocated_grad));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(
+      CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd", &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index a7d24c4cd1..c9b2c319a4 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -234,6 +234,30 @@ inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedSca
   }
 }
 
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                   CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                            CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
@@ -519,6 +543,32 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, con
   }
 }
 
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                   CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                            CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D quadrature weights
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 59abdf4c2a..f86b43b9c8 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -54,6 +54,36 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   }
 }
 
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    }
+  }
+}
+
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -93,6 +123,36 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   }
 }
 
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+}
+
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
     void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -132,6 +192,37 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   }
 }
 
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                      CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
@@ -178,6 +269,47 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
   }
 }
 
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                        CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+    }
+  }
+}
+
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
     void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                        CeedScalar *__restrict__ d_V) {
@@ -222,6 +354,47 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   }
 }
 
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                 CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
     void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                           CeedScalar *__restrict__ d_V) {
@@ -266,6 +439,47 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   }
 }
 
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                    CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Weight kernels by dim
 //------------------------------------------------------------------------------

From 0ccda8ebe42db3fb90cdb724a58e4e5d2aedf1a1 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 28 Jul 2025 13:18:42 -0600
Subject: [PATCH 472/571] gpu - collocated nodes/qpts for gen

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp |  58 +++++--
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |  55 ++++--
 ...-shared-basis-tensor-flattened-templates.h | 157 +++++++++++++++++-
 .../cuda/cuda-shared-basis-tensor-templates.h |  86 +++++++++-
 .../cuda/cuda-shared-basis-tensor.h           |  12 +-
 ...-shared-basis-tensor-flattened-templates.h | 138 +++++++++++++++
 .../hip/hip-shared-basis-tensor-templates.h   |  86 +++++++++-
 .../jit-source/hip/hip-shared-basis-tensor.h  |  12 +-
 8 files changed, 533 insertions(+), 71 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 481c358466..0f3bfd9021 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -261,8 +261,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 
         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
       } else {
-        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        bool is_collocated = false;
+
+        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+        if (is_collocated && !is_at_points) {
+          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+        } else {
+          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        }
       }
       break;
     case CEED_EVAL_GRAD:
@@ -293,8 +300,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 
           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          bool is_collocated = false;
+
+          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+          if (is_collocated && !is_at_points) {
+            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          }
         }
       }
       if (is_at_points) break;  // No G mat for AtPoints
@@ -492,10 +506,11 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
                                                  bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
-  bool      is_tensor = true;
+  bool      is_tensor = true, is_collocated = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
 
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
@@ -534,9 +549,9 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
-          std::string function_name = is_tensor
-                                          ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
-                                          : "InterpNonTensor";
+          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                                   std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                : "InterpNonTensor";
           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
@@ -552,15 +567,18 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
-          std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
+          std::string function_name =
+              (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
 
           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
-          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
-                                      (is_all_tensor ? "" : "Flattened");
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "Grad"
+                        : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : ""))) +
+                              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"));
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
@@ -611,7 +629,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
-              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                           std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                         : "InterpTransposeNonTensor";
           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
@@ -627,14 +646,17 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
-          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                      std::to_string(dim) + "d";
 
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
-          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
-                                      std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "GradTranspose"
+                        : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
@@ -870,7 +892,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
           tab.push();
-          code << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
+          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
           tab.pop();
           code << tab << "}\n";
           break;
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 9e29d16cb4..7bf2d8f667 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -288,8 +288,15 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 
         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
       } else {
-        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        bool is_collocated = false;
+
+        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+        if (is_collocated && !is_at_points) {
+          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+        } else {
+          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        }
       }
       break;
     case CEED_EVAL_GRAD:
@@ -320,8 +327,15 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
 
           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          bool is_collocated = false;
+
+          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+          if (is_collocated && !is_at_points) {
+            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          }
         }
       }
       if (is_at_points) break;  // No G mat for AtPoints
@@ -519,10 +533,11 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
                                                 bool is_at_points, bool use_3d_slices) {
-  bool      is_tensor = true;
+  bool      is_tensor = true, is_collocated = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
 
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
@@ -561,9 +576,9 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else {
-          std::string function_name = is_tensor
-                                          ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
-                                          : "InterpNonTensor";
+          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                                   std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                : "InterpNonTensor";
           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
@@ -579,15 +594,17 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
         } else if (use_3d_slices) {
-          std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
+          std::string function_name =
+              (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
 
           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else if (is_tensor) {
-          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
-                                      (is_all_tensor ? "" : "Flattened");
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
@@ -638,7 +655,8 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else {
           std::string function_name =
-              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                           std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                         : "InterpTransposeNonTensor";
           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
@@ -654,14 +672,17 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (use_3d_slices) {
-          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                      std::to_string(dim) + "d";
 
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
         } else if (is_tensor) {
-          bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-          std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
-                                      std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "GradTranspose"
+                        : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
index 4f76825d50..fcc084c687 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -161,6 +161,39 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, C
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -202,6 +235,41 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, Cee
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
@@ -432,6 +500,39 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -510,19 +611,59 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
   CeedScalar    r_t1[1], r_t2[1];
 
   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
-    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
-    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
-    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
-    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
-    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
   }
   __syncthreads();
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index 6ccd26f96a..af2bdebcbd 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -66,6 +66,28 @@ inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar
   }
 }
 
+//------------------------------------------------------------------------------
+// 1D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
 //------------------------------------------------------------------------------
 // 1D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -205,6 +227,28 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
   }
 }
 
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -239,8 +283,8 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
 // 2D derivatives at quadrature points, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                   CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
@@ -251,8 +295,8 @@ inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const
 // 2D derivatives transpose, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                            CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
@@ -467,6 +511,32 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed
   }
 }
 
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -549,8 +619,8 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co
 // 3D derivatives at quadrature points, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                   CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
@@ -562,8 +632,8 @@ inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const
 // 3D derivatives transpose, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                            CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 1925488c73..53a50bfd16 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -294,13 +294,13 @@ extern "C" __global__ void GradCollocated(const CeedInt num_elem, const CeedScal
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                     d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -378,12 +378,12 @@ extern "C" __global__ void GradCollocatedTranspose(const CeedInt num_elem, const
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -461,12 +461,12 @@ extern "C" __global__ void GradCollocatedTransposeAdd(const CeedInt num_elem, co
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
index 25c5078718..c13aab77f6 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
@@ -161,6 +161,39 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, Ce
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -202,6 +235,41 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, Ceed
   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
 }
 
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
@@ -432,6 +500,39 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, Ce
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -528,6 +629,43 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &
   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
 }
 
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
 //------------------------------------------------------------------------------
 // 3D quadrature weights
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index c9b2c319a4..72c1f6731d 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -66,6 +66,28 @@ inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar
   }
 }
 
+//------------------------------------------------------------------------------
+// 1D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
 //------------------------------------------------------------------------------
 // 1D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -204,6 +226,28 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedS
   }
 }
 
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -238,8 +282,8 @@ inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedSca
 // 2D derivatives at quadrature points, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                   CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
@@ -250,8 +294,8 @@ inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const C
 // 2D derivatives transpose, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                            CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
@@ -465,6 +509,32 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedS
   }
 }
 
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
@@ -547,8 +617,8 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, con
 // 3D derivatives at quadrature points, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                   CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
@@ -560,8 +630,8 @@ inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const C
 // 3D derivatives transpose, nodes and quadrature points collocated
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                            CeedScalar *__restrict__ r_V) {
+inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index f86b43b9c8..f1fb0bbe6b 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -297,13 +297,13 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                     d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -383,12 +383,12 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -468,12 +468,12 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }

From 725737e8460c40d58b42a1300b40f05b75ddbc05 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 28 Jul 2025 16:35:24 -0600
Subject: [PATCH 473/571] petsc - add BP1+5 and BP2+6 to help test colllocated

---
 examples/petsc/bps.c                    | 21 +++++++++++----
 examples/petsc/bps.h                    |  2 +-
 examples/petsc/include/bpsproblemdata.h | 36 ++++++++++++++++++++++++-
 examples/petsc/include/structs.h        | 13 ++++++++-
 4 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index 5ced922afd..c5f02e3f6d 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -114,6 +114,15 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
     const char *used_resource;
     CeedGetResource(ceed, &used_resource);
 
+    bool is_combined_bp = rp->bp_choice > CEED_BP6;
+    char bp_name[6]     = "";
+
+    if (is_combined_bp) {
+      PetscCall(PetscSNPrintf(bp_name, 6, "%d + %d", rp->bp_choice % 2 ? 2 : 1, rp->bp_choice - CEED_BP4));
+    } else {
+      PetscCall(PetscSNPrintf(bp_name, 6, "%d", rp->bp_choice + 1));
+    }
+
     VecType vec_type;
     PetscCall(VecGetType(X, &vec_type));
 
@@ -125,7 +134,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
     PetscMPIInt      comm_size;
     PetscCall(MPI_Comm_size(rp->comm, &comm_size));
     PetscCall(PetscPrintf(rp->comm,
-                          "\n-- CEED Benchmark Problem %" CeedInt_FMT " -- libCEED + PETSc --\n"
+                          "\n-- CEED Benchmark Problem %s -- libCEED + PETSc --\n"
                           "  MPI:\n"
                           "    Hostname                                : %s\n"
                           "    Total ranks                             : %d\n"
@@ -144,8 +153,8 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
                           "    Element topology                        : %s\n"
                           "    Owned nodes                             : %" PetscInt_FMT "\n"
                           "    DoF per node                            : %" PetscInt_FMT "\n",
-                          rp->bp_choice + 1, rp->hostname, comm_size, rp->ranks_per_node, vec_type, used_resource, CeedMemTypes[mem_type_backend], P,
-                          Q, rp->q_extra, g_size / rp->num_comp_u, c_end - c_start, CeedElemTopologies[elem_topo], l_size / rp->num_comp_u,
+                          bp_name, rp->hostname, comm_size, rp->ranks_per_node, vec_type, used_resource, CeedMemTypes[mem_type_backend], P, Q,
+                          rp->q_extra, g_size / rp->num_comp_u, c_end - c_start, CeedElemTopologies[elem_topo], l_size / rp->num_comp_u,
                           rp->num_comp_u));
   }
 
@@ -185,9 +194,10 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
   {
     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
-    if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2 || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24) {
+    if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2 || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24 ||
+        rp->bp_choice == CEED_BP15 || rp->bp_choice == CEED_BP26) {
       PetscCall(PCSetType(pc, PCJACOBI));
-      if (rp->simplex || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24) {
+      if (rp->simplex || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24 || rp->bp_choice == CEED_BP15 || rp->bp_choice == CEED_BP26) {
         PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL));
       } else {
         PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM));
@@ -260,6 +270,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
       // Tighter tol for BP1, BP2
       // Looser tol for BP3, BP4, BP5, and BP6 with extra for vector valued problems
       // BP1+3 and BP2+4 follow the pattern for BP3 and BP4
+      // BP1+5 and BP2+6 follow the pattern for BP5 and BP6
       PetscReal tol = rp->bp_choice < CEED_BP3 ? 5e-4 : (5e-2 + (rp->bp_choice % 2 == 1 ? 5e-3 : 0));
       if (!rp->test_mode || l2_error > tol) {
         PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, rp->comm));
diff --git a/examples/petsc/bps.h b/examples/petsc/bps.h
index 0564f4aeb0..e45e9dcb17 100644
--- a/examples/petsc/bps.h
+++ b/examples/petsc/bps.h
@@ -17,4 +17,4 @@ static const char *const mem_types[] = {"host", "device", "memType", "CEED_MEM_"
 typedef enum { COARSEN_UNIFORM = 0, COARSEN_LOGARITHMIC = 1 } CoarsenType;
 static const char *const coarsen_types[] = {"uniform", "logarithmic", "CoarsenType", "COARSEN", 0};
 
-static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "bp1_3", "bp2_4", "BPType", "CEED_BP", 0};
+static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "bp1_3", "bp2_4", "bp1_5", "bp2_6", "BPType", "CEED_BP", 0};
diff --git a/examples/petsc/include/bpsproblemdata.h b/examples/petsc/include/bpsproblemdata.h
index 4d50ccc98d..5e3a55d8f5 100644
--- a/examples/petsc/include/bpsproblemdata.h
+++ b/examples/petsc/include/bpsproblemdata.h
@@ -25,7 +25,7 @@
 // BP Option Data
 // -----------------------------------------------------------------------------
 
-BPData bp_options[8] = {
+BPData bp_options[10] = {
     [CEED_BP1]  = {.num_comp_u    = 1,
                    .num_comp_x    = 3,
                    .topo_dim      = 3,
@@ -162,4 +162,38 @@ BPData bp_options[8] = {
                    .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
                    .q_mode        = CEED_GAUSS,
                    .enforce_bc    = PETSC_TRUE },
+    [CEED_BP15] = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs,
+                   .apply         = MassDiff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs_loc,
+                   .apply_loc     = MassDiff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP26] = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs3,
+                   .apply         = MassDiff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs3_loc,
+                   .apply_loc     = MassDiff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
 };
diff --git a/examples/petsc/include/structs.h b/examples/petsc/include/structs.h
index ff6602c891..8b6c48e296 100644
--- a/examples/petsc/include/structs.h
+++ b/examples/petsc/include/structs.h
@@ -65,7 +65,18 @@ typedef struct {
 } BPData;
 
 // BP options
-typedef enum { CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5, CEED_BP13 = 6, CEED_BP24 = 7 } BPType;
+typedef enum {
+  CEED_BP1  = 0,
+  CEED_BP2  = 1,
+  CEED_BP3  = 2,
+  CEED_BP4  = 3,
+  CEED_BP5  = 4,
+  CEED_BP6  = 5,
+  CEED_BP13 = 6,
+  CEED_BP24 = 7,
+  CEED_BP15 = 8,
+  CEED_BP26 = 9,
+} BPType;
 
 // -----------------------------------------------------------------------------
 // Parameter structure for running problems

From ca1da9b9b9d880e22f5c01cec39b447b56f5eccd Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 29 Jul 2025 09:28:22 -0600
Subject: [PATCH 474/571] gen - skip mat load when assembling QFs

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 44 ++++++++++++-------
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 44 ++++++++++++-------
 2 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 0f3bfd9021..7059a2c891 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -180,11 +180,19 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
                                                      CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse,
                                                      CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
-                                                     bool use_3d_slices) {
-  bool      is_tensor = true;
+                                                     bool use_3d_slices, bool skip_active_load) {
+  bool      is_tensor = true, is_active = true;
   CeedBasis basis;
+
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+  }
 
   const char            *field_name;
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
@@ -256,7 +264,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      if (use_previous_field) {
+      if (use_previous_field && !skip_active_load) {
         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
@@ -264,7 +272,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         bool is_collocated = false;
 
         CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
-        if (is_collocated && !is_at_points) {
+        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
           code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
         } else {
           code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -295,7 +303,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
       if (is_tensor) {
-        if (use_previous_field) {
+        if (use_previous_field && !skip_active_load) {
           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
@@ -303,7 +311,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
           bool is_collocated = false;
 
           CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
-          if (is_collocated && !is_at_points) {
+          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
             code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
           } else {
             code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -315,10 +323,12 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+        } else if (is_active && skip_active_load) {
+          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
         } else {
           code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
           code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
@@ -329,19 +339,23 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
           } else {
             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
             code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         } else {
-          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
           } else {
             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                  << (is_tensor ? "" : var_suffix) << "];\n";
@@ -1453,12 +1467,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
-                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
   }
   code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
-                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
   }
 
   // Loop over all elements
@@ -1819,12 +1833,12 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
   code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
-                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
   }
   code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
-                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
   }
 
   // Loop over all elements
@@ -2385,12 +2399,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
   code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
-                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
   }
   code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
-                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
   }
 
   // Loop over all elements
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 7bf2d8f667..3f20ab5070 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -207,11 +207,19 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                     CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                     CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
-                                                    bool use_3d_slices) {
-  bool      is_tensor = true;
+                                                    bool use_3d_slices, bool skip_active_load) {
+  bool      is_tensor = true, is_active = true;
   CeedBasis basis;
+
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+  }
 
   const char           *field_name;
   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
@@ -283,7 +291,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      if (use_previous_field) {
+      if (use_previous_field && !skip_active_load) {
         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
@@ -291,7 +299,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         bool is_collocated = false;
 
         CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
-        if (is_collocated && !is_at_points) {
+        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
           code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
         } else {
           code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -322,7 +330,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         else data->B.outputs[i] = basis_data->d_interp_1d;
       }
       if (is_tensor) {
-        if (use_previous_field) {
+        if (use_previous_field && !skip_active_load) {
           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
@@ -330,7 +338,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
           bool is_collocated = false;
 
           CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
-          if (is_collocated && !is_at_points) {
+          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
             code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
           } else {
             code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -342,10 +350,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
-        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
           code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+        } else if (is_active && skip_active_load) {
+          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
         } else {
           code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
           code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
@@ -356,19 +366,23 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
         if (has_collo_grad) {
-          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
           } else {
             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
             code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
           }
         } else {
-          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
           } else {
             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                  << (is_tensor ? "" : var_suffix) << "];\n";
@@ -1465,12 +1479,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
-                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
   }
   code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
-                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
   }
 
   // Loop over all elements
@@ -1823,12 +1837,12 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
   code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
-                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
   }
   code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
-                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
   }
 
   // Loop over all elements
@@ -2380,12 +2394,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
   code << "\n" << tab << "// Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
-                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
   }
   code << "\n" << tab << "// Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
-                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
   }
 
   // Loop over all elements

From 239c7151e4ad57b8a0671e342ed106325657bbef Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 31 Jul 2025 08:58:36 -0600
Subject: [PATCH 475/571] Add .venv to gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index e7b100a069..06ab679a45 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,5 @@ libCEED.includes
 *.txt
 *.proto
 *.csv
+
+.venv

From 7d878d164e15003bde30f90bc7bb38ba28c8c324 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 1 Aug 2025 10:03:42 -0600
Subject: [PATCH 476/571] cuda - fix misplaced ) for gen Grad (#1882)

---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 7059a2c891..804109754f 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -590,9 +590,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         } else if (is_tensor) {
           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
           std::string function_name =
-              (dim == 1 ? "Grad"
-                        : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : ""))) +
-                              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"));
+              (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"

From ebfb1ab346d5a1addc1221edc1d1c7f1a6380df6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 4 Aug 2025 11:42:48 -0600
Subject: [PATCH 477/571] doc - list HONEE in libCEED backends svg

---
 doc/img/libCEEDBackends.svg | 2988 +++++++++++++----------------------
 doc/img/libCEEDBackends.tex |  192 +++
 2 files changed, 1319 insertions(+), 1861 deletions(-)
 create mode 100644 doc/img/libCEEDBackends.tex

diff --git a/doc/img/libCEEDBackends.svg b/doc/img/libCEEDBackends.svg
index cff3b2527a..d8e96bb13b 100644
--- a/doc/img/libCEEDBackends.svg
+++ b/doc/img/libCEEDBackends.svg
@@ -1,1862 +1,1128 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="349.31" height="250.054" viewBox="0 0 349.31 250.054">
-<defs>
-<g>
-<g id="glyph-0-0">
-<path d="M 4.859375 -3.28125 C 4.46875 -3.28125 3.96875 -3.265625 3.625 -3.265625 C 3.28125 -3.265625 2.625 -3.28125 2.359375 -3.28125 L 3.640625 -6.234375 L 3.65625 -6.234375 C 4.140625 -5.109375 4.546875 -4.125 4.859375 -3.28125 Z M 2.140625 -2.71875 C 2.4375 -2.75 3.265625 -2.765625 3.6875 -2.765625 C 4.140625 -2.765625 4.796875 -2.75 5.078125 -2.71875 C 5.671875 -1.15625 5.921875 -0.234375 6.015625 0.03125 C 6.21875 0 6.40625 0 6.609375 0 C 6.8125 0 7.125 0 7.328125 0.03125 C 6.734375 -1.171875 5.15625 -5.25 4.046875 -7.859375 L 3.71875 -7.859375 C 2.5625 -5.21875 1.40625 -2.609375 0.1875 0.03125 C 0.328125 0 0.484375 0 0.609375 0 C 0.734375 0 1 0 1.140625 0.03125 C 1.34375 -0.6875 1.703125 -1.671875 2.140625 -2.71875 Z M 2.140625 -2.71875 "/>
-</g>
-<g id="glyph-0-1">
-<path d="M 1.9375 -3.515625 C 2.203125 -4.140625 2.90625 -4.703125 3.375 -4.703125 C 4.1875 -4.703125 4.609375 -3.953125 4.609375 -2.609375 C 4.609375 -1.625 4.328125 -0.359375 3.046875 -0.359375 C 2.859375 -0.359375 2.359375 -0.40625 1.9375 -0.890625 Z M 1.9375 -4.203125 L 1.9375 -5.078125 C 1.9375 -5.15625 1.90625 -5.203125 1.859375 -5.203125 C 1.59375 -5.140625 1.15625 -5.125 0.890625 -5.15625 L 0.875 -5.125 C 0.953125 -4.609375 0.984375 -3.703125 0.984375 -2.8125 L 0.984375 0.375 C 0.984375 1.28125 0.953125 2.140625 0.875 2.765625 L 0.890625 2.8125 C 1.046875 2.78125 1.3125 2.765625 1.453125 2.765625 C 1.609375 2.765625 1.875 2.78125 2.015625 2.8125 L 2.046875 2.765625 C 1.953125 2.09375 1.9375 1.296875 1.9375 0.375 L 1.9375 -0.140625 C 2.25 0.03125 2.6875 0.125 3.125 0.125 C 4.703125 0.125 5.65625 -1.078125 5.65625 -2.765625 C 5.65625 -4.03125 4.90625 -5.25 3.578125 -5.25 C 3.109375 -5.25 2.515625 -5.015625 1.953125 -4.171875 Z M 1.9375 -4.203125 "/>
-</g>
-<g id="glyph-0-2">
-<path d="M 1.078125 -2.390625 C 1.078125 -1.5 1.046875 -0.640625 0.96875 0 L 0.984375 0.03125 C 1.140625 0.015625 1.40625 0 1.546875 0 C 1.703125 0 1.96875 0.015625 2.109375 0.03125 L 2.140625 0 C 2.046875 -0.6875 2.03125 -1.484375 2.03125 -2.390625 L 2.03125 -5.96875 C 2.03125 -6.875 2.078125 -7.453125 2.140625 -8.21875 C 2.140625 -8.3125 2.109375 -8.34375 2.03125 -8.34375 C 1.71875 -8.21875 1.46875 -8.140625 0.984375 -8.109375 L 0.96875 -8.0625 C 1.046875 -7.546875 1.078125 -6.671875 1.078125 -5.765625 Z M 1.078125 -2.390625 "/>
-</g>
-<g id="glyph-0-3">
-<path d="M 0.953125 -7.140625 C 0.953125 -6.8125 1.25 -6.53125 1.5625 -6.53125 C 1.890625 -6.53125 2.171875 -6.8125 2.171875 -7.140625 C 2.171875 -7.453125 1.890625 -7.75 1.5625 -7.75 C 1.25 -7.75 0.953125 -7.453125 0.953125 -7.140625 Z M 1.078125 -2.8125 L 1.078125 -2.15625 C 1.078125 -1.25 1.046875 -0.640625 0.96875 0 L 0.984375 0.03125 C 1.140625 0.015625 1.40625 0 1.546875 0 C 1.703125 0 1.96875 0.015625 2.109375 0.03125 L 2.140625 0 C 2.046875 -0.671875 2.03125 -1.25 2.03125 -2.15625 L 2.03125 -3 C 2.03125 -3.890625 2.0625 -4.328125 2.140625 -5.0625 C 2.140625 -5.15625 2.109375 -5.171875 2.03125 -5.171875 C 1.75 -5.140625 1.25 -5.140625 0.984375 -5.15625 L 0.96875 -5.125 C 1.046875 -4.609375 1.078125 -3.703125 1.078125 -2.8125 Z M 1.078125 -2.8125 "/>
-</g>
-<g id="glyph-0-4">
-<path d="M 2.984375 -4.765625 C 3.4375 -4.765625 3.859375 -4.421875 4.1875 -3.890625 L 4.34375 -3.890625 L 4.609375 -4.875 L 4.59375 -4.90625 C 4.21875 -5.109375 3.59375 -5.25 3 -5.25 C 1.75 -5.25 0.453125 -4.234375 0.453125 -2.609375 C 0.453125 -0.9375 1.40625 0.125 2.859375 0.125 C 3.578125 0.125 4.140625 -0.125 4.609375 -0.734375 L 4.390625 -0.984375 L 4.34375 -0.984375 C 3.890625 -0.578125 3.5 -0.484375 3.0625 -0.484375 C 2.1875 -0.484375 1.5 -1.265625 1.5 -2.671875 C 1.5 -3.984375 2.203125 -4.765625 2.984375 -4.765625 Z M 2.984375 -4.765625 "/>
-</g>
-<g id="glyph-0-5">
-<path d="M 3.703125 -2.6875 L 3.65625 -1.265625 C 3.65625 -1.109375 3.578125 -1.03125 3.484375 -0.953125 C 3.15625 -0.703125 2.75 -0.484375 2.390625 -0.484375 C 1.859375 -0.484375 1.5 -0.84375 1.5 -1.21875 C 1.5 -1.75 1.75 -2.15625 2.6875 -2.421875 Z M 3.703125 -0.578125 C 3.84375 -0.09375 4.203125 0.125 4.640625 0.125 C 4.9375 0.125 5.3125 0.046875 5.578125 -0.25 L 5.5 -0.53125 C 5.359375 -0.484375 5.265625 -0.484375 5.171875 -0.484375 C 5.0625 -0.484375 4.90625 -0.5 4.8125 -0.578125 C 4.703125 -0.6875 4.625 -0.96875 4.625 -1.5 C 4.625 -1.859375 4.65625 -3.171875 4.65625 -3.3125 C 4.65625 -4.921875 3.59375 -5.25 2.6875 -5.25 C 1.78125 -5.25 1.234375 -4.8125 0.9375 -4.546875 L 0.890625 -4.5 L 1.09375 -3.703125 L 1.25 -3.6875 C 1.59375 -4.234375 2 -4.703125 2.5625 -4.703125 C 3 -4.703125 3.71875 -4.65625 3.71875 -3.296875 C 3.71875 -3.21875 3.671875 -3.171875 3.640625 -3.15625 L 2.515625 -2.90625 C 1.296875 -2.625 0.53125 -1.984375 0.53125 -1.171875 C 0.53125 -0.28125 1.140625 0.125 2.03125 0.125 C 2.6875 0.125 3.03125 -0.03125 3.65625 -0.578125 Z M 3.703125 -0.578125 "/>
-</g>
-<g id="glyph-0-6">
-<path d="M 1.140625 -5.125 C 0.890625 -5.125 0.640625 -5.125 0.484375 -5.15625 C 0.40625 -4.953125 0.328125 -4.84375 0.234375 -4.65625 L 0.28125 -4.578125 C 0.484375 -4.59375 0.859375 -4.59375 1.140625 -4.609375 L 1.140625 -2.96875 C 1.140625 -2.25 1.09375 -1.390625 1.09375 -1.03125 C 1.09375 -0.25 1.609375 0.125 2.15625 0.125 C 2.65625 0.125 3.03125 0 3.53125 -0.328125 L 3.375 -0.609375 C 3.03125 -0.484375 2.75 -0.484375 2.453125 -0.53125 C 2.15625 -0.5625 2.0625 -0.84375 2.0625 -1.5 C 2.0625 -1.859375 2.09375 -2.375 2.09375 -3.09375 L 2.09375 -4.609375 L 2.546875 -4.609375 C 2.828125 -4.609375 3.234375 -4.59375 3.40625 -4.578125 C 3.4375 -4.765625 3.484375 -4.890625 3.546875 -5.0625 L 3.484375 -5.15625 C 3.28125 -5.140625 2.921875 -5.125 2.65625 -5.125 L 2.09375 -5.125 C 2.09375 -6.015625 2.09375 -6.1875 2.15625 -6.875 C 2.15625 -6.953125 2.109375 -7 2.046875 -7 C 1.734375 -6.875 1.59375 -6.734375 1.203125 -6.6875 L 1.1875 -6.65625 C 1.15625 -6.234375 1.140625 -5.828125 1.140625 -5.125 Z M 1.140625 -5.125 "/>
-</g>
-<g id="glyph-0-7">
-<path d="M 0.453125 -2.453125 C 0.453125 -1.015625 1.40625 0.125 2.984375 0.125 C 4.5625 0.125 5.515625 -0.984375 5.515625 -2.5625 C 5.515625 -4.1875 4.65625 -5.25 3.03125 -5.25 C 1.46875 -5.25 0.453125 -4.140625 0.453125 -2.453125 Z M 2.953125 -4.765625 C 4.21875 -4.765625 4.46875 -3.796875 4.46875 -2.34375 C 4.46875 -1.1875 3.890625 -0.359375 3.09375 -0.359375 C 1.828125 -0.359375 1.5 -1.734375 1.5 -2.65625 C 1.5 -3.6875 1.828125 -4.765625 2.953125 -4.765625 Z M 2.953125 -4.765625 "/>
-</g>
-<g id="glyph-0-8">
-<path d="M 5.5 -2.15625 C 5.5 -2.53125 5.515625 -2.96875 5.515625 -3.359375 C 5.515625 -4.609375 5.109375 -5.25 4.046875 -5.25 C 3.578125 -5.25 2.734375 -5.0625 1.96875 -4.171875 L 1.953125 -4.203125 L 1.953125 -5.078125 C 1.9375 -5.15625 1.921875 -5.203125 1.875 -5.203125 C 1.609375 -5.140625 1.171875 -5.125 0.90625 -5.15625 L 0.890625 -5.125 C 0.96875 -4.609375 0.984375 -3.703125 0.984375 -2.8125 L 0.984375 -2.15625 C 0.984375 -1.25 0.984375 -0.640625 0.890625 0 L 0.90625 0.03125 C 1.046875 0.015625 1.328125 0 1.46875 0 C 1.609375 0 1.890625 0.015625 2.03125 0.03125 L 2.0625 0 C 1.953125 -0.6875 1.953125 -1.25 1.953125 -2.15625 L 1.953125 -3.546875 C 2.5625 -4.265625 3.25 -4.53125 3.6875 -4.53125 C 4.296875 -4.53125 4.546875 -4.296875 4.546875 -3.328125 L 4.546875 -2.15625 C 4.546875 -1.25 4.515625 -0.640625 4.4375 0 L 4.453125 0.03125 C 4.609375 0.015625 4.875 0 5.015625 0 C 5.15625 0 5.4375 0.015625 5.578125 0.03125 L 5.609375 0 C 5.515625 -0.6875 5.5 -1.25 5.5 -2.15625 Z M 5.5 -2.15625 "/>
-</g>
-<g id="glyph-0-9">
-<path d="M 2.28125 -5.3125 C 2.28125 -6.21875 2.3125 -7.078125 2.40625 -7.703125 L 2.390625 -7.75 C 2.25 -7.71875 1.90625 -7.703125 1.765625 -7.703125 C 1.625 -7.703125 1.296875 -7.71875 1.15625 -7.75 L 1.140625 -7.703125 C 1.234375 -7.03125 1.25 -6.21875 1.25 -5.3125 L 1.25 -2.390625 C 1.25 -1.5 1.234375 -0.640625 1.140625 0 L 1.140625 0.03125 C 1.140625 0.03125 1.359375 0 1.765625 0 L 5.03125 0 C 5.28125 0 5.5625 0.015625 5.75 0.03125 L 5.765625 0 C 5.765625 -0.171875 5.75 -0.40625 5.75 -0.53125 C 5.75 -0.640625 5.765625 -0.84375 5.765625 -0.9375 L 5.75 -0.984375 C 5.75 -0.984375 3.984375 -0.734375 2.375 -0.734375 C 2.296875 -1.015625 2.28125 -2.1875 2.28125 -2.390625 Z M 2.28125 -5.3125 "/>
-</g>
-<g id="glyph-0-10">
-<path d="M 1.9375 -0.96875 L 1.9375 -4.0625 C 2.40625 -4.546875 2.703125 -4.765625 3.1875 -4.765625 C 3.921875 -4.765625 4.5625 -4.15625 4.5625 -2.671875 C 4.5625 -1.1875 4.125 -0.359375 2.984375 -0.359375 C 2.625 -0.359375 2.171875 -0.671875 1.9375 -0.96875 Z M 1.9375 -5.96875 C 1.9375 -6.859375 1.953125 -7.484375 2.015625 -8.21875 C 2.015625 -8.3125 1.96875 -8.34375 1.90625 -8.34375 C 1.59375 -8.21875 1.34375 -8.140625 0.859375 -8.109375 L 0.84375 -8.0625 C 0.921875 -7.546875 0.984375 -6.671875 0.984375 -5.765625 L 0.984375 -0.875 C 0.984375 -0.40625 0.96875 -0.234375 0.921875 0.03125 C 0.984375 0.09375 1.125 0.125 1.25 0.125 C 1.390625 -0.03125 1.515625 -0.234375 1.65625 -0.46875 C 2 -0.171875 2.546875 0.125 3.0625 0.125 C 4.28125 0.125 5.625 -0.734375 5.625 -2.765625 C 5.625 -4.234375 4.59375 -5.25 3.4375 -5.25 C 2.859375 -5.25 2.34375 -5.09375 1.9375 -4.65625 Z M 1.9375 -5.96875 "/>
-</g>
-<g id="glyph-0-11">
-<path d="M 2 -4 L 2 -5.078125 C 2 -5.15625 1.96875 -5.203125 1.921875 -5.203125 C 1.65625 -5.140625 1.21875 -5.125 0.953125 -5.15625 L 0.9375 -5.125 C 1.015625 -4.609375 1.046875 -3.703125 1.046875 -2.8125 L 1.046875 -2.15625 C 1.046875 -1.25 1.015625 -0.640625 0.9375 0 L 0.953125 0.03125 C 1.09375 0.015625 1.375 0 1.515625 0 C 1.65625 0 1.9375 0.015625 2.078125 0.03125 L 2.109375 0 C 2.015625 -0.6875 2 -1.25 2 -2.15625 L 2 -2.765625 C 2 -3.28125 2.140625 -3.5625 2.40625 -3.953125 C 2.5625 -4.21875 2.84375 -4.375 3.078125 -4.375 C 3.3125 -4.375 3.53125 -4.34375 3.6875 -4.203125 L 3.796875 -4.234375 L 4.03125 -5.109375 L 3.984375 -5.15625 C 3.78125 -5.21875 3.765625 -5.25 3.546875 -5.25 C 2.90625 -5.25 2.5625 -4.875 2.03125 -3.96875 Z M 2 -4 "/>
-</g>
-<g id="glyph-0-12">
-<path d="M 0.328125 -5.15625 C 0.796875 -4.09375 2.21875 -1.015625 2.65625 0.125 C 2.265625 1.015625 1.78125 1.9375 1.25 2.828125 C 1.359375 2.796875 1.546875 2.765625 1.65625 2.765625 C 1.78125 2.765625 2.125 2.796875 2.25 2.828125 C 2.703125 1.34375 5.3125 -4.359375 5.71875 -5.15625 C 5.59375 -5.125 5.3125 -5.125 5.203125 -5.125 C 5.078125 -5.125 4.859375 -5.125 4.734375 -5.15625 C 4.296875 -3.8125 3.734375 -2.28125 3.15625 -1.046875 L 3.125 -1.046875 C 2.546875 -2.46875 1.984375 -3.796875 1.546875 -5.15625 C 1.390625 -5.125 1.125 -5.125 0.953125 -5.125 C 0.796875 -5.125 0.5 -5.125 0.328125 -5.15625 Z M 0.328125 -5.15625 "/>
-</g>
-<g id="glyph-0-13">
-<path d="M 2.265625 -6.6875 C 2.265625 -7.078125 2.390625 -7.234375 3.359375 -7.234375 C 4.03125 -7.234375 4.875 -6.96875 4.875 -5.75 C 4.875 -4.734375 4.203125 -4.4375 3.203125 -4.4375 L 2.265625 -4.4375 Z M 2.265625 -3.921875 L 3.375 -3.921875 C 4.78125 -3.921875 5.4375 -3.046875 5.4375 -1.96875 C 5.4375 -1.140625 5.15625 -0.484375 3.4375 -0.484375 C 2.59375 -0.484375 2.265625 -0.65625 2.265625 -1.046875 Z M 1.75 -7.703125 C 1.34375 -7.703125 1.140625 -7.75 1.140625 -7.75 L 1.125 -7.703125 C 1.21875 -7.03125 1.25 -6.21875 1.25 -5.3125 L 1.25 -2.390625 C 1.25 -1.5 1.21875 -0.640625 1.125 0 L 1.140625 0.03125 C 1.140625 0.03125 1.34375 0 1.75 0 C 2.5 0 2.59375 0.03125 3.65625 0.03125 C 6.03125 0.03125 6.625 -1.171875 6.625 -2.203125 C 6.625 -3.34375 5.84375 -4 4.8125 -4.296875 C 5.421875 -4.609375 5.921875 -5.234375 5.921875 -5.890625 C 5.921875 -6.6875 5.5 -7.75 3.28125 -7.75 C 2.859375 -7.75 2.359375 -7.703125 1.75 -7.703125 Z M 1.75 -7.703125 "/>
-</g>
-<g id="glyph-0-14">
-<path d="M 1.078125 -2.390625 C 1.078125 -1.5 1.046875 -0.640625 0.96875 0 L 0.984375 0.03125 C 1.140625 0.015625 1.40625 0 1.546875 0 C 1.703125 0 1.96875 0.015625 2.109375 0.03125 L 2.140625 0 C 2.046875 -0.6875 2.03125 -1.484375 2.03125 -2.390625 L 2.03125 -2.515625 C 2.125 -2.5 2.421875 -2.46875 2.515625 -2.375 C 3.296875 -1.5625 3.578125 -1.140625 4.40625 0.03125 C 4.578125 0.03125 4.984375 0 5.1875 0 C 5.375 0 5.796875 0.03125 5.90625 0.03125 L 5.921875 0 C 5.03125 -0.9375 4.40625 -1.453125 3.265625 -2.84375 C 3.734375 -3.328125 4.90625 -4.453125 5.671875 -5.125 L 5.65625 -5.15625 C 5.421875 -5.125 4.765625 -5.125 4.453125 -5.125 C 3.921875 -4.375 3.03125 -3.40625 2.5625 -3.046875 C 2.40625 -2.921875 2.1875 -2.875 2.03125 -2.875 L 2.03125 -5.96875 C 2.03125 -6.875 2.078125 -7.453125 2.140625 -8.21875 C 2.140625 -8.3125 2.109375 -8.34375 2.03125 -8.34375 C 1.71875 -8.21875 1.46875 -8.140625 0.984375 -8.109375 L 0.96875 -8.0625 C 1.046875 -7.546875 1.078125 -6.671875 1.078125 -5.765625 Z M 1.078125 -2.390625 "/>
-</g>
-<g id="glyph-0-15">
-<path d="M 1.5 -3.265625 C 1.703125 -4.5 2.46875 -4.765625 2.859375 -4.765625 C 3.328125 -4.765625 3.859375 -4.328125 3.859375 -3.4375 C 3.859375 -3.328125 3.8125 -3.265625 3.6875 -3.265625 Z M 4.6875 -1.25 C 4.265625 -0.78125 3.703125 -0.578125 3.046875 -0.578125 C 2.625 -0.578125 2.0625 -0.734375 1.75 -1.265625 C 1.53125 -1.609375 1.453125 -2.078125 1.453125 -2.78125 L 4.703125 -2.78125 C 4.84375 -2.78125 4.921875 -2.859375 4.921875 -2.984375 C 4.921875 -4 4.4375 -5.25 2.859375 -5.25 C 1.625 -5.25 0.40625 -4.25 0.40625 -2.5 C 0.40625 -1.8125 0.53125 -1.140625 0.9375 -0.671875 C 1.34375 -0.171875 2.03125 0.125 2.84375 0.125 C 3.703125 0.125 4.46875 -0.328125 4.921875 -0.9375 Z M 4.6875 -1.25 "/>
-</g>
-<g id="glyph-0-16">
-<path d="M 4.3125 -1.34375 C 3.875 -0.78125 3.328125 -0.5 2.84375 -0.5 C 2.203125 -0.5 1.65625 -1.140625 1.65625 -2.640625 C 1.65625 -4.4375 2.59375 -4.765625 3.171875 -4.765625 C 3.734375 -4.765625 4.03125 -4.53125 4.3125 -4.078125 Z M 4.3125 -0.6875 L 4.34375 -0.6875 L 4.40625 0 C 4.40625 0.03125 4.4375 0.03125 4.5 0.03125 C 4.65625 0.03125 4.75 0 4.90625 0 C 5.0625 0 5.296875 0.015625 5.453125 0.03125 L 5.46875 0 C 5.375 -0.515625 5.265625 -1.40625 5.265625 -2.296875 L 5.265625 -5.96875 C 5.265625 -6.859375 5.3125 -7.484375 5.375 -8.21875 C 5.375 -8.3125 5.34375 -8.34375 5.265625 -8.34375 C 4.953125 -8.21875 4.703125 -8.140625 4.234375 -8.109375 L 4.203125 -8.0625 C 4.296875 -7.546875 4.3125 -6.671875 4.3125 -5.765625 L 4.3125 -4.984375 C 4.046875 -5.140625 3.546875 -5.25 3.3125 -5.25 C 1.75 -5.25 0.609375 -4.171875 0.609375 -2.546875 C 0.609375 -1.078125 1.484375 0.125 2.765625 0.125 C 3.34375 0.125 3.890625 -0.125 4.3125 -0.6875 Z M 4.3125 -0.6875 "/>
-</g>
-<g id="glyph-0-17">
-<path d="M 0.59375 -1.203125 L 0.390625 -0.203125 C 1.078125 0.03125 1.75 0.125 2.203125 0.125 C 3.84375 0.125 4.234375 -0.859375 4.234375 -1.515625 C 4.234375 -2.5 3.453125 -2.875 2.609375 -3.078125 C 2.15625 -3.1875 1.5 -3.421875 1.5 -4.015625 C 1.5 -4.5 1.890625 -4.765625 2.40625 -4.765625 C 3.03125 -4.765625 3.40625 -4.28125 3.6875 -3.953125 L 3.84375 -3.96875 L 4.09375 -4.859375 L 4.0625 -4.890625 C 3.71875 -5.0625 3.0625 -5.25 2.453125 -5.25 C 1.5625 -5.25 0.640625 -4.78125 0.640625 -3.796875 C 0.640625 -2.828125 1.34375 -2.515625 2.078125 -2.3125 C 2.765625 -2.125 3.28125 -1.9375 3.28125 -1.34375 C 3.28125 -0.71875 2.828125 -0.359375 2.1875 -0.359375 C 1.609375 -0.359375 1.109375 -0.75 0.78125 -1.21875 Z M 0.59375 -1.203125 "/>
-</g>
-<g id="glyph-0-18">
-<path d="M 6.40625 -5.3125 L 6.40625 -4.359375 L 2.265625 -4.359375 L 2.265625 -5.3125 C 2.265625 -6.21875 2.296875 -7.078125 2.390625 -7.703125 L 2.375 -7.75 C 2.234375 -7.71875 1.90625 -7.703125 1.75 -7.703125 C 1.609375 -7.703125 1.296875 -7.71875 1.140625 -7.75 L 1.125 -7.703125 C 1.21875 -7.03125 1.25 -6.21875 1.25 -5.3125 L 1.25 -2.390625 C 1.25 -1.5 1.21875 -0.640625 1.125 0 L 1.140625 0.03125 C 1.28125 0.015625 1.609375 0 1.75 0 C 1.90625 0 2.21875 0.015625 2.359375 0.03125 L 2.390625 0 C 2.296875 -0.6875 2.265625 -1.5 2.265625 -2.390625 L 2.265625 -3.84375 L 6.40625 -3.84375 L 6.40625 -2.390625 C 6.40625 -1.5 6.390625 -0.640625 6.28125 0 L 6.296875 0.03125 C 6.4375 0.015625 6.78125 0 6.921875 0 C 7.0625 0 7.390625 0.015625 7.53125 0.03125 L 7.546875 0 C 7.453125 -0.6875 7.4375 -1.5 7.4375 -2.390625 L 7.4375 -5.3125 C 7.4375 -6.21875 7.453125 -7.078125 7.546875 -7.703125 L 7.546875 -7.75 C 7.40625 -7.71875 7.0625 -7.703125 6.921875 -7.703125 C 6.78125 -7.703125 6.453125 -7.71875 6.3125 -7.75 L 6.28125 -7.703125 C 6.390625 -7.03125 6.40625 -6.21875 6.40625 -5.3125 Z M 6.40625 -5.3125 "/>
-</g>
-<g id="glyph-0-19">
-<path d="M 2.453125 0.046875 C 2.5625 0.015625 2.71875 0.015625 2.84375 0.015625 C 2.96875 0.015625 3.171875 0.015625 3.28125 0.046875 C 3.59375 -0.859375 3.9375 -1.859375 4.328125 -2.78125 C 4.6875 -1.859375 5.0625 -0.890625 5.421875 0.03125 C 5.53125 0 5.609375 0 5.71875 0 C 5.84375 0 6.0625 0 6.171875 0.03125 C 6.75 -1.375 7.703125 -3.671875 8.390625 -5.15625 C 8.28125 -5.125 8 -5.125 7.875 -5.125 C 7.75 -5.125 7.578125 -5.125 7.46875 -5.15625 C 7.03125 -3.75 6.484375 -2.21875 5.984375 -1.078125 L 5.890625 -1.078125 C 5.40625 -2.390625 5.03125 -3.890625 4.75 -5.15625 C 4.609375 -5.125 4.328125 -5.125 4.15625 -5.125 C 3.96875 -5.125 3.65625 -5.125 3.46875 -5.15625 C 3.625 -4.671875 3.8125 -4.140625 4.015625 -3.59375 C 3.703125 -2.703125 3.375 -1.8125 3.03125 -1.0625 L 2.953125 -1.0625 C 2.359375 -2.421875 1.953125 -3.78125 1.53125 -5.15625 C 1.375 -5.109375 1.09375 -5.109375 0.9375 -5.109375 C 0.734375 -5.109375 0.4375 -5.109375 0.234375 -5.15625 C 1.046875 -3.390625 1.78125 -1.703125 2.453125 0.046875 Z M 2.453125 0.046875 "/>
-</g>
-<g id="glyph-1-0">
-<path d="M 1.890625 -2.84375 C 2.25 -2.84375 2.546875 -2.859375 2.8125 -2.875 C 3.09375 -2.546875 3.3125 -2.15625 3.5625 -1.75 C 3.921875 -1.171875 4.40625 -0.3125 4.515625 0.03125 C 4.703125 0 4.921875 0 5.109375 0 C 5.3125 0 5.515625 0 5.71875 0.03125 L 5.734375 0 C 5.234375 -0.5625 4.234375 -2.140625 3.59375 -2.984375 C 3.875 -3.03125 4.0625 -3.09375 4.203125 -3.171875 C 4.75 -3.4375 5.203125 -3.953125 5.203125 -4.78125 C 5.203125 -5.28125 5.03125 -5.6875 4.671875 -6.015625 C 4.203125 -6.46875 3.4375 -6.5 2.8125 -6.5 C 2.5625 -6.5 1.765625 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.078125 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 Z M 2.828125 -6.078125 C 3.421875 -6.078125 4.25 -5.84375 4.25 -4.765625 C 4.25 -3.546875 3.4375 -3.25 2.46875 -3.25 L 1.890625 -3.25 L 1.890625 -5.625 C 1.890625 -6.015625 1.921875 -6.078125 2.828125 -6.078125 Z M 2.828125 -6.078125 "/>
-</g>
-<g id="glyph-1-1">
-<path d="M 3.09375 -2.25 L 3.046875 -1.0625 C 3.046875 -0.921875 2.984375 -0.859375 2.90625 -0.796875 C 2.625 -0.59375 2.296875 -0.40625 2 -0.40625 C 1.546875 -0.40625 1.25 -0.703125 1.25 -1.015625 C 1.25 -1.46875 1.46875 -1.796875 2.25 -2.015625 Z M 3.09375 -0.484375 C 3.203125 -0.078125 3.5 0.09375 3.875 0.09375 C 4.109375 0.09375 4.421875 0.046875 4.65625 -0.203125 L 4.578125 -0.453125 C 4.46875 -0.40625 4.390625 -0.40625 4.3125 -0.40625 C 4.21875 -0.40625 4.09375 -0.421875 4.015625 -0.484375 C 3.921875 -0.5625 3.859375 -0.8125 3.859375 -1.25 C 3.859375 -1.546875 3.890625 -2.65625 3.890625 -2.765625 C 3.890625 -4.109375 3 -4.375 2.25 -4.375 C 1.484375 -4.375 1.03125 -4 0.78125 -3.796875 L 0.75 -3.765625 L 0.921875 -3.09375 L 1.046875 -3.078125 C 1.328125 -3.53125 1.671875 -3.921875 2.140625 -3.921875 C 2.5 -3.921875 3.09375 -3.875 3.09375 -2.75 C 3.09375 -2.6875 3.0625 -2.640625 3.03125 -2.625 L 2.109375 -2.421875 C 1.09375 -2.1875 0.453125 -1.65625 0.453125 -0.984375 C 0.453125 -0.234375 0.953125 0.09375 1.6875 0.09375 C 2.25 0.09375 2.515625 -0.03125 3.046875 -0.484375 Z M 3.09375 -0.484375 "/>
-</g>
-<g id="glyph-1-2">
-<path d="M 0.953125 -4.28125 C 0.734375 -4.28125 0.53125 -4.28125 0.40625 -4.296875 C 0.34375 -4.125 0.28125 -4.03125 0.1875 -3.890625 L 0.234375 -3.8125 C 0.40625 -3.828125 0.71875 -3.828125 0.953125 -3.84375 L 0.953125 -2.46875 C 0.953125 -1.875 0.921875 -1.15625 0.921875 -0.859375 C 0.921875 -0.203125 1.328125 0.09375 1.796875 0.09375 C 2.21875 0.09375 2.53125 0 2.953125 -0.265625 L 2.828125 -0.515625 C 2.515625 -0.40625 2.296875 -0.40625 2.046875 -0.4375 C 1.796875 -0.46875 1.71875 -0.703125 1.71875 -1.25 C 1.71875 -1.5625 1.75 -1.984375 1.75 -2.578125 L 1.75 -3.84375 L 2.125 -3.84375 C 2.359375 -3.84375 2.703125 -3.828125 2.84375 -3.8125 C 2.875 -3.984375 2.90625 -4.078125 2.953125 -4.234375 L 2.90625 -4.296875 C 2.734375 -4.28125 2.4375 -4.28125 2.21875 -4.28125 L 1.75 -4.28125 C 1.75 -5.015625 1.75 -5.15625 1.796875 -5.734375 C 1.796875 -5.796875 1.765625 -5.828125 1.703125 -5.828125 C 1.453125 -5.734375 1.328125 -5.625 1 -5.578125 L 0.984375 -5.546875 C 0.96875 -5.203125 0.953125 -4.859375 0.953125 -4.28125 Z M 0.953125 -4.28125 "/>
-</g>
-<g id="glyph-1-3">
-<path d="M 1.25 -2.71875 C 1.421875 -3.765625 2.0625 -3.984375 2.375 -3.984375 C 2.765625 -3.984375 3.21875 -3.609375 3.21875 -2.859375 C 3.21875 -2.765625 3.1875 -2.71875 3.078125 -2.71875 Z M 3.90625 -1.03125 C 3.5625 -0.65625 3.09375 -0.484375 2.546875 -0.484375 C 2.1875 -0.484375 1.71875 -0.625 1.453125 -1.0625 C 1.28125 -1.328125 1.21875 -1.734375 1.21875 -2.328125 L 3.921875 -2.328125 C 4.03125 -2.328125 4.109375 -2.375 4.109375 -2.5 C 4.109375 -3.328125 3.703125 -4.375 2.375 -4.375 C 1.359375 -4.375 0.34375 -3.546875 0.34375 -2.078125 C 0.34375 -1.515625 0.453125 -0.953125 0.78125 -0.5625 C 1.125 -0.140625 1.6875 0.09375 2.375 0.09375 C 3.09375 0.09375 3.734375 -0.265625 4.109375 -0.78125 Z M 3.90625 -1.03125 "/>
-</g>
-<g id="glyph-1-4">
-<path d="M 0.890625 -2 C 0.890625 -1.25 0.875 -0.53125 0.8125 0 L 0.828125 0.03125 C 0.953125 0.015625 1.171875 0 1.296875 0 C 1.421875 0 1.640625 0.015625 1.765625 0.03125 L 1.78125 0 C 1.703125 -0.5625 1.6875 -1.234375 1.6875 -2 L 1.6875 -4.984375 C 1.6875 -5.734375 1.734375 -6.21875 1.78125 -6.859375 C 1.78125 -6.921875 1.75 -6.953125 1.6875 -6.953125 C 1.4375 -6.859375 1.21875 -6.78125 0.828125 -6.765625 L 0.8125 -6.734375 C 0.875 -6.296875 0.890625 -5.5625 0.890625 -4.8125 Z M 0.890625 -2 "/>
-</g>
-<g id="glyph-1-5">
-<path d="M 1.890625 -5.625 C 1.890625 -5.921875 2.078125 -6.078125 2.796875 -6.078125 C 3.484375 -6.078125 4.203125 -5.875 4.203125 -4.6875 C 4.203125 -3.546875 3.65625 -3.1875 2.6875 -3.1875 C 2.4375 -3.1875 2.03125 -3.203125 1.890625 -3.28125 Z M 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.078125 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -2.84375 C 2.109375 -2.78125 2.375 -2.75 2.734375 -2.75 C 4.546875 -2.75 5.171875 -3.890625 5.171875 -4.75 C 5.171875 -5.484375 4.703125 -6.5 2.859375 -6.5 C 2.609375 -6.5 1.765625 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 Z M 1.03125 -4.4375 "/>
-</g>
-<g id="glyph-1-6">
-<path d="M 1.890625 -2 L 1.890625 -3.15625 C 2.375 -3.15625 3.421875 -3.125 4.109375 -3.0625 L 4.140625 -3.09375 C 4.125 -3.1875 4.109375 -3.3125 4.109375 -3.40625 C 4.109375 -3.5 4.125 -3.640625 4.140625 -3.734375 L 4.109375 -3.765625 C 3.53125 -3.703125 3.078125 -3.65625 1.890625 -3.65625 L 1.890625 -4.4375 C 1.890625 -4.609375 1.90625 -5.625 1.96875 -5.875 C 3.3125 -5.875 4.6875 -5.75 4.6875 -5.75 L 4.703125 -5.796875 C 4.6875 -5.875 4.6875 -5.96875 4.6875 -6.046875 C 4.6875 -6.125 4.6875 -6.296875 4.703125 -6.4375 L 4.6875 -6.453125 C 4.53125 -6.4375 4.296875 -6.4375 4.09375 -6.4375 L 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 L 4.1875 0 C 4.390625 0 4.625 0.015625 4.78125 0.03125 L 4.8125 0 C 4.796875 -0.140625 4.78125 -0.203125 4.78125 -0.3125 C 4.78125 -0.40625 4.796875 -0.5625 4.8125 -0.640625 L 4.78125 -0.671875 C 4.78125 -0.671875 3.3125 -0.5625 1.96875 -0.5625 C 1.90625 -0.796875 1.890625 -1.828125 1.890625 -2 Z M 1.890625 -2 "/>
-</g>
-<g id="glyph-1-7">
-<path d="M 2.265625 -4.4375 L 2.265625 -2 C 2.265625 -1.25 2.25 -0.53125 2.15625 0 L 2.1875 0.03125 C 2.296875 0.015625 2.578125 0 2.6875 0 C 2.8125 0 3.078125 0.015625 3.203125 0.03125 L 3.21875 0 C 3.140625 -0.5625 3.125 -1.25 3.125 -2 L 3.125 -4.4375 C 3.125 -4.609375 3.125 -5.625 3.203125 -5.875 C 4.53125 -5.875 5.21875 -5.734375 5.21875 -5.734375 L 5.234375 -5.765625 C 5.21875 -5.921875 5.21875 -6.15625 5.234375 -6.4375 L 5.21875 -6.453125 C 5.046875 -6.4375 4.828125 -6.4375 4.609375 -6.4375 L 0.765625 -6.4375 C 0.5625 -6.4375 0.328125 -6.4375 0.171875 -6.453125 L 0.15625 -6.4375 C 0.171875 -6.15625 0.171875 -5.921875 0.15625 -5.765625 L 0.171875 -5.734375 C 0.171875 -5.734375 0.84375 -5.875 2.1875 -5.875 C 2.25 -5.625 2.265625 -4.609375 2.265625 -4.4375 Z M 2.265625 -4.4375 "/>
-</g>
-<g id="glyph-1-8">
-<path d="M 2.40625 -0.34375 C 1.859375 -0.34375 1.140625 -0.828125 0.828125 -1.421875 L 0.734375 -1.40625 C 0.6875 -1.046875 0.578125 -0.671875 0.5 -0.375 L 0.515625 -0.34375 C 0.515625 -0.34375 1.171875 0.09375 2.328125 0.09375 C 3.5625 0.09375 4.5 -0.640625 4.5 -1.796875 C 4.5 -2.953125 3.515625 -3.484375 2.71875 -3.796875 C 2.21875 -3.984375 1.5 -4.28125 1.5 -5.109375 C 1.5 -5.46875 1.703125 -5.859375 1.96875 -6 C 2.140625 -6.09375 2.34375 -6.125 2.578125 -6.125 C 3.125 -6.125 3.65625 -5.6875 3.921875 -5.078125 L 4.03125 -5.078125 C 4.0625 -5.4375 4.171875 -5.78125 4.25 -6.078125 L 4.234375 -6.109375 C 4.234375 -6.109375 3.8125 -6.5625 2.65625 -6.5625 C 2.375 -6.5625 2.078125 -6.515625 1.796875 -6.390625 C 1.203125 -6.140625 0.703125 -5.5625 0.703125 -4.828125 C 0.703125 -3.78125 1.578125 -3.296875 2.40625 -2.953125 C 3.0625 -2.6875 3.578125 -2.359375 3.578125 -1.5 C 3.578125 -0.75 3 -0.34375 2.40625 -0.34375 Z M 2.40625 -0.34375 "/>
-</g>
-<g id="glyph-1-9">
-<path d="M 2.5 -3.984375 C 2.875 -3.984375 3.21875 -3.6875 3.484375 -3.234375 L 3.625 -3.25 L 3.84375 -4.0625 L 3.828125 -4.09375 C 3.515625 -4.25 3 -4.375 2.5 -4.375 C 1.46875 -4.375 0.375 -3.53125 0.375 -2.171875 C 0.375 -0.78125 1.171875 0.09375 2.375 0.09375 C 2.984375 0.09375 3.453125 -0.09375 3.84375 -0.609375 L 3.65625 -0.8125 L 3.625 -0.8125 C 3.234375 -0.484375 2.921875 -0.40625 2.546875 -0.40625 C 1.828125 -0.40625 1.25 -1.0625 1.25 -2.21875 C 1.25 -3.3125 1.828125 -3.984375 2.5 -3.984375 Z M 2.5 -3.984375 "/>
-</g>
-<g id="glyph-1-10">
-<path d="M 1.078125 -6.4375 C 1.171875 -4.53125 1.140625 -1.578125 0.984375 0 L 1 0.03125 C 1.109375 0.015625 1.21875 0 1.34375 0 C 1.46875 0 1.578125 0.015625 1.6875 0.03125 L 1.71875 0 C 1.640625 -0.5625 1.609375 -1.25 1.609375 -2 L 1.609375 -4.375 C 1.609375 -4.984375 1.625 -5.015625 2 -4.53125 L 5.484375 -0.140625 C 5.59375 0.015625 5.75 0.09375 5.90625 0.09375 C 6.046875 0.09375 6.09375 -0.015625 6.09375 -0.203125 C 6.15625 -2.484375 6.140625 -4.21875 6.296875 -6.4375 L 6.28125 -6.453125 C 6.15625 -6.4375 6.046875 -6.4375 5.9375 -6.4375 C 5.8125 -6.4375 5.703125 -6.4375 5.578125 -6.453125 L 5.5625 -6.4375 C 5.640625 -5.859375 5.65625 -5.1875 5.65625 -4.4375 L 5.65625 -1.78125 C 5.640625 -1.21875 5.5 -1.453125 5.078125 -2.03125 L 1.65625 -6.453125 C 1.65625 -6.453125 1.5625 -6.4375 1.515625 -6.4375 C 1.171875 -6.4375 1.09375 -6.453125 1.09375 -6.453125 Z M 1.078125 -6.4375 "/>
-</g>
-<g id="glyph-1-11">
-<path d="M 0.890625 -2 C 0.890625 -1.25 0.875 -0.53125 0.8125 0 L 0.828125 0.03125 C 0.953125 0.015625 1.171875 0 1.296875 0 C 1.421875 0 1.640625 0.015625 1.765625 0.03125 L 1.78125 0 C 1.703125 -0.5625 1.6875 -1.234375 1.6875 -2 L 1.6875 -2.09375 C 1.78125 -2.078125 2.015625 -2.046875 2.109375 -1.984375 C 2.75 -1.3125 2.984375 -0.953125 3.671875 0.03125 C 3.8125 0.015625 4.15625 0 4.328125 0 C 4.484375 0 4.828125 0.015625 4.921875 0.03125 L 4.9375 0 C 4.203125 -0.78125 3.671875 -1.21875 2.71875 -2.375 C 3.109375 -2.765625 4.09375 -3.71875 4.734375 -4.28125 L 4.71875 -4.3125 C 4.53125 -4.28125 3.984375 -4.28125 3.703125 -4.28125 C 3.265625 -3.65625 2.515625 -2.84375 2.140625 -2.546875 C 2 -2.4375 1.828125 -2.40625 1.6875 -2.390625 L 1.6875 -4.984375 C 1.6875 -5.734375 1.734375 -6.21875 1.78125 -6.859375 C 1.78125 -6.921875 1.75 -6.953125 1.6875 -6.953125 C 1.4375 -6.859375 1.21875 -6.78125 0.828125 -6.765625 L 0.8125 -6.734375 C 0.875 -6.296875 0.890625 -5.5625 0.890625 -4.8125 Z M 0.890625 -2 "/>
-</g>
-<g id="glyph-1-12">
-<path d="M 2.015625 0.09375 C 3.203125 0.09375 4.046875 -0.78125 4.046875 -1.953125 C 4.046875 -2.921875 3.40625 -3.78125 2.28125 -3.78125 C 1.875 -3.78125 1.421875 -3.71875 1.203125 -3.640625 L 1.40625 -5.40625 C 1.78125 -5.359375 2.203125 -5.3125 2.671875 -5.3125 C 2.96875 -5.3125 3.3125 -5.328125 3.71875 -5.375 L 3.875 -6.046875 L 3.8125 -6.078125 C 3.234375 -6.015625 2.703125 -5.984375 2.171875 -5.984375 C 1.796875 -5.984375 1.375 -6.015625 1.03125 -6.046875 L 0.71875 -3.09375 L 0.78125 -3.078125 C 1.171875 -3.234375 1.5625 -3.390625 2.03125 -3.390625 C 2.671875 -3.390625 3.171875 -2.875 3.171875 -1.78125 C 3.171875 -0.875 2.71875 -0.296875 2.03125 -0.296875 C 1.28125 -0.296875 1.15625 -0.703125 0.828125 -1.296875 L 0.6875 -1.28125 L 0.46875 -0.484375 L 0.515625 -0.453125 C 0.75 -0.234375 1.25 0.09375 2.015625 0.09375 Z M 2.015625 0.09375 "/>
-</g>
-<g id="glyph-1-13">
-<path d="M 2.328125 -5.6875 C 2.5 -5.6875 2.65625 -5.625 2.765625 -5.53125 C 3.078125 -5.28125 3.34375 -4.5 3.34375 -3.15625 C 3.34375 -2.25 3.3125 -1.71875 3.171875 -1.21875 C 2.953125 -0.390625 2.46875 -0.296875 2.28125 -0.296875 C 1.359375 -0.296875 1.25 -2 1.25 -2.859375 C 1.25 -5.328125 1.859375 -5.6875 2.328125 -5.6875 Z M 2.265625 0.09375 C 3.046875 0.09375 4.203125 -0.75 4.203125 -3.078125 C 4.203125 -4.640625 3.625 -5.40625 3.265625 -5.734375 C 3 -5.984375 2.6875 -6.078125 2.328125 -6.078125 C 1.328125 -6.078125 0.390625 -4.84375 0.390625 -2.875 C 0.390625 -1.265625 1.03125 0.09375 2.265625 0.09375 Z M 2.265625 0.09375 "/>
-</g>
-<g id="glyph-1-14">
-<path d="M 1.921875 -6.59375 C 1.578125 -4.578125 1.03125 -1.828125 0.640625 0.03125 C 0.75 0 0.84375 0 0.96875 0 C 1.078125 0 1.15625 0 1.265625 0.03125 C 1.421875 -1.015625 1.75 -3.234375 2 -4.75 L 2.046875 -4.75 C 2.796875 -3.15625 3.5 -1.546875 4.140625 0 L 4.328125 0 C 5.046875 -1.640625 5.75 -3.15625 6.546875 -4.71875 L 6.5625 -4.703125 C 6.78125 -3.140625 6.984375 -1.609375 7.125 0.03125 C 7.28125 0 7.46875 0 7.609375 0 C 7.765625 0 8.03125 0 8.171875 0.03125 C 7.8125 -2.109375 7.515625 -4.171875 7.234375 -6.59375 L 6.984375 -6.59375 L 4.5625 -1.65625 L 4.484375 -1.65625 C 3.703125 -3.296875 2.984375 -4.890625 2.25 -6.59375 Z M 1.921875 -6.59375 "/>
-</g>
-<g id="glyph-1-15">
-<path d="M 4.0625 -6.4375 L 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 C 1.796875 0 1.96875 0.03125 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -3.15625 C 2.375 -3.15625 3.421875 -3.125 4.109375 -3.0625 L 4.140625 -3.09375 C 4.125 -3.1875 4.109375 -3.3125 4.109375 -3.40625 C 4.109375 -3.5 4.125 -3.640625 4.140625 -3.734375 L 4.109375 -3.765625 C 3.53125 -3.703125 3.078125 -3.65625 1.890625 -3.65625 L 1.890625 -4.4375 C 1.890625 -4.609375 1.90625 -5.625 1.96875 -5.875 C 3.3125 -5.875 4.6875 -5.75 4.6875 -5.75 L 4.703125 -5.796875 C 4.6875 -5.875 4.6875 -6 4.6875 -6.09375 C 4.6875 -6.171875 4.6875 -6.296875 4.703125 -6.4375 L 4.6875 -6.453125 C 4.53125 -6.4375 4.296875 -6.4375 4.0625 -6.4375 Z M 4.0625 -6.4375 "/>
-</g>
-<g id="glyph-1-16">
-<path d="M 0.796875 -5.953125 C 0.796875 -5.6875 1.03125 -5.4375 1.3125 -5.4375 C 1.578125 -5.4375 1.8125 -5.6875 1.8125 -5.953125 C 1.8125 -6.21875 1.578125 -6.453125 1.3125 -6.453125 C 1.03125 -6.453125 0.796875 -6.21875 0.796875 -5.953125 Z M 0.890625 -2.34375 L 0.890625 -1.796875 C 0.890625 -1.046875 0.875 -0.53125 0.8125 0 L 0.828125 0.03125 C 0.953125 0.015625 1.171875 0 1.296875 0 C 1.421875 0 1.640625 0.015625 1.765625 0.03125 L 1.78125 0 C 1.703125 -0.5625 1.6875 -1.03125 1.6875 -1.796875 L 1.6875 -2.5 C 1.6875 -3.25 1.71875 -3.609375 1.78125 -4.21875 C 1.78125 -4.3125 1.765625 -4.3125 1.6875 -4.3125 C 1.46875 -4.28125 1.03125 -4.28125 0.828125 -4.3125 L 0.8125 -4.28125 C 0.875 -3.84375 0.890625 -3.09375 0.890625 -2.34375 Z M 0.890625 -2.34375 "/>
-</g>
-<g id="glyph-1-17">
-<path d="M 1.609375 -0.8125 L 1.609375 -3.390625 C 2 -3.78125 2.25 -3.984375 2.65625 -3.984375 C 3.265625 -3.984375 3.8125 -3.46875 3.8125 -2.234375 C 3.8125 -1 3.4375 -0.296875 2.5 -0.296875 C 2.1875 -0.296875 1.8125 -0.5625 1.609375 -0.8125 Z M 1.609375 -4.984375 C 1.609375 -5.71875 1.625 -6.234375 1.671875 -6.859375 C 1.671875 -6.921875 1.640625 -6.953125 1.578125 -6.953125 C 1.328125 -6.859375 1.109375 -6.78125 0.71875 -6.765625 L 0.703125 -6.734375 C 0.765625 -6.296875 0.8125 -5.5625 0.8125 -4.8125 L 0.8125 -0.734375 C 0.8125 -0.34375 0.8125 -0.1875 0.765625 0.015625 C 0.828125 0.078125 0.9375 0.09375 1.046875 0.09375 C 1.15625 -0.015625 1.265625 -0.1875 1.375 -0.390625 C 1.671875 -0.140625 2.125 0.09375 2.546875 0.09375 C 3.5625 0.09375 4.6875 -0.625 4.6875 -2.3125 C 4.6875 -3.53125 3.828125 -4.375 2.859375 -4.375 C 2.375 -4.375 1.953125 -4.25 1.609375 -3.875 Z M 1.609375 -4.984375 "/>
-</g>
-<g id="glyph-1-18">
-<path d="M 3.78125 -6.5625 C 2.109375 -6.5625 0.515625 -5.109375 0.515625 -3.140625 C 0.515625 -1.421875 1.5 0.09375 3.65625 0.09375 C 4.5625 0.09375 5.4375 -0.171875 6.09375 -0.984375 C 6.09375 -1.109375 6.078125 -1.328125 6.046875 -1.421875 L 5.96875 -1.453125 C 5.265625 -0.671875 4.671875 -0.40625 3.734375 -0.40625 C 2.46875 -0.40625 1.546875 -1.78125 1.546875 -3.328125 C 1.546875 -5.359375 2.828125 -6.09375 3.65625 -6.09375 C 4.546875 -6.09375 5.28125 -5.734375 5.6875 -4.921875 L 5.796875 -4.921875 C 5.828125 -5.4375 5.875 -5.609375 5.984375 -6.03125 L 5.96875 -6.0625 C 5.96875 -6.0625 5 -6.5625 3.78125 -6.5625 Z M 3.78125 -6.5625 "/>
-</g>
-<g id="glyph-1-19">
-<path d="M 1.890625 -0.8125 L 1.890625 -5.625 C 1.890625 -5.96875 2.25 -6.015625 2.734375 -6.015625 C 4.890625 -6.015625 5.53125 -4.359375 5.53125 -2.828125 C 5.53125 -0.8125 4.390625 -0.421875 2.984375 -0.421875 C 2 -0.421875 1.890625 -0.5 1.890625 -0.8125 Z M 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 C 2.171875 0 2.265625 0.015625 3.296875 0.015625 C 4.6875 0.015625 6.5625 -0.625 6.5625 -3.078125 C 6.5625 -4.921875 5.03125 -6.453125 3.109375 -6.453125 C 2.46875 -6.453125 2.109375 -6.4375 1.46875 -6.4375 Z M 1.46875 -6.4375 "/>
-</g>
-<g id="glyph-1-20">
-<path d="M 3.640625 -0.921875 L 3.671875 -0.890625 L 3.734375 0 C 3.734375 0.015625 3.734375 0.03125 3.765625 0.03125 C 3.890625 0.015625 4.015625 0 4.15625 0 C 4.28125 0 4.46875 0.015625 4.609375 0.03125 L 4.625 0 C 4.546875 -0.421875 4.453125 -1.171875 4.453125 -1.90625 L 4.453125 -2.484375 C 4.453125 -3.234375 4.46875 -3.75 4.546875 -4.28125 L 4.53125 -4.3125 C 4.40625 -4.28125 4.171875 -4.28125 4.0625 -4.28125 C 3.9375 -4.28125 3.703125 -4.28125 3.59375 -4.3125 L 3.5625 -4.28125 C 3.640625 -3.703125 3.65625 -3.234375 3.65625 -2.484375 L 3.65625 -1.5 C 3.34375 -0.984375 2.78125 -0.453125 2.296875 -0.453125 C 1.9375 -0.453125 1.625 -0.5625 1.625 -1.5 L 1.625 -2.484375 C 1.625 -3.234375 1.640625 -3.75 1.71875 -4.28125 L 1.6875 -4.3125 C 1.578125 -4.28125 1.34375 -4.28125 1.21875 -4.28125 C 1.109375 -4.28125 0.875 -4.28125 0.75 -4.3125 L 0.734375 -4.28125 C 0.8125 -3.703125 0.828125 -3.234375 0.828125 -2.484375 L 0.828125 -1.28125 C 0.828125 -0.59375 1.140625 0.09375 2.140625 0.09375 C 2.75 0.09375 3.3125 -0.40625 3.640625 -0.921875 Z M 3.640625 -0.921875 "/>
-</g>
-<g id="glyph-1-21">
-<path d="M 1.671875 -3.328125 L 1.671875 -4.234375 C 1.671875 -4.3125 1.640625 -4.34375 1.609375 -4.34375 C 1.375 -4.28125 1.015625 -4.28125 0.796875 -4.3125 L 0.78125 -4.28125 C 0.84375 -3.84375 0.875 -3.09375 0.875 -2.34375 L 0.875 -1.796875 C 0.875 -1.046875 0.84375 -0.53125 0.78125 0 L 0.796875 0.03125 C 0.921875 0.015625 1.140625 0 1.265625 0 C 1.390625 0 1.609375 0.015625 1.734375 0.03125 L 1.75 0 C 1.671875 -0.5625 1.671875 -1.03125 1.671875 -1.796875 L 1.671875 -2.3125 C 1.671875 -2.734375 1.78125 -2.96875 2 -3.296875 C 2.140625 -3.515625 2.375 -3.65625 2.5625 -3.65625 C 2.765625 -3.65625 2.953125 -3.625 3.078125 -3.515625 L 3.15625 -3.53125 L 3.359375 -4.265625 L 3.3125 -4.3125 C 3.15625 -4.359375 3.140625 -4.375 2.953125 -4.375 C 2.421875 -4.375 2.140625 -4.0625 1.6875 -3.3125 Z M 1.671875 -3.328125 "/>
-</g>
-<g id="glyph-1-22">
-<path d="M 4.0625 -2.734375 C 3.734375 -2.734375 3.3125 -2.71875 3.015625 -2.71875 C 2.734375 -2.71875 2.1875 -2.734375 1.96875 -2.734375 L 3.03125 -5.1875 L 3.046875 -5.1875 C 3.453125 -4.25 3.78125 -3.4375 4.0625 -2.734375 Z M 1.78125 -2.265625 C 2.03125 -2.296875 2.71875 -2.296875 3.078125 -2.296875 C 3.453125 -2.296875 4 -2.296875 4.234375 -2.265625 C 4.71875 -0.96875 4.9375 -0.1875 5.015625 0.03125 C 5.1875 0 5.34375 0 5.515625 0 C 5.6875 0 5.9375 0 6.109375 0.03125 C 5.609375 -0.984375 4.296875 -4.375 3.375 -6.5625 L 3.09375 -6.5625 C 2.140625 -4.34375 1.171875 -2.171875 0.15625 0.03125 C 0.28125 0 0.40625 0 0.515625 0 C 0.625 0 0.84375 0 0.953125 0.03125 C 1.109375 -0.578125 1.421875 -1.390625 1.78125 -2.265625 Z M 1.78125 -2.265625 "/>
-</g>
-<g id="glyph-1-23">
-<path d="M 3.234375 -1.3125 L 3.203125 -1.3125 C 1.890625 -4.34375 1.390625 -6.109375 1.3125 -6.453125 C 1.140625 -6.4375 0.90625 -6.4375 0.75 -6.4375 C 0.59375 -6.4375 0.3125 -6.4375 0.15625 -6.453125 C 0.65625 -5.453125 1.96875 -2.078125 2.890625 0.09375 L 3.171875 0.09375 C 4.125 -2.109375 5.109375 -4.25 6.109375 -6.453125 C 5.984375 -6.4375 5.796875 -6.4375 5.71875 -6.4375 C 5.609375 -6.4375 5.375 -6.4375 5.265625 -6.453125 C 4.921875 -5.171875 3.9375 -2.96875 3.234375 -1.3125 Z M 3.234375 -1.3125 "/>
-</g>
-<g id="glyph-1-24">
-<path d="M 5.625 0.03125 C 4.8125 -1.109375 4.171875 -2.109375 3.296875 -3.4375 C 4.203125 -4.78125 5.03125 -5.859375 5.46875 -6.453125 C 5.359375 -6.4375 5.15625 -6.4375 5.046875 -6.4375 C 4.9375 -6.4375 4.734375 -6.4375 4.625 -6.453125 C 4.078125 -5.453125 3.84375 -5.09375 3.015625 -3.890625 C 2.359375 -4.890625 1.6875 -5.859375 1.40625 -6.453125 C 1.21875 -6.4375 1 -6.4375 0.84375 -6.4375 C 0.671875 -6.4375 0.453125 -6.4375 0.28125 -6.453125 L 2.4375 -3.203125 L 0.234375 0.03125 C 0.34375 0 0.53125 0 0.640625 0 C 0.765625 0 0.953125 0 1.0625 0.03125 C 1.59375 -0.953125 2.171875 -1.890625 2.734375 -2.734375 C 3.375 -1.78125 3.9375 -0.96875 4.46875 0.03125 C 4.640625 0 4.859375 0 5.046875 0 C 5.21875 0 5.4375 0 5.625 0.03125 Z M 5.625 0.03125 "/>
-</g>
-<g id="glyph-1-25">
-<path d="M 1.90625 -4.4375 C 1.90625 -5.1875 1.921875 -5.90625 2 -6.4375 L 2 -6.453125 C 1.875 -6.4375 1.59375 -6.4375 1.46875 -6.4375 C 1.359375 -6.4375 1.09375 -6.4375 0.96875 -6.453125 L 0.953125 -6.4375 C 1.03125 -5.859375 1.046875 -5.1875 1.046875 -4.4375 L 1.046875 -2 C 1.046875 -1.25 1.03125 -0.53125 0.953125 0 L 0.953125 0.03125 C 0.953125 0.03125 1.140625 0 1.46875 0 L 4.203125 0 C 4.40625 0 4.640625 0.015625 4.796875 0.03125 L 4.8125 0 C 4.8125 -0.140625 4.796875 -0.34375 4.796875 -0.4375 C 4.796875 -0.53125 4.8125 -0.703125 4.8125 -0.78125 L 4.796875 -0.828125 C 4.796875 -0.828125 3.3125 -0.609375 1.984375 -0.609375 C 1.90625 -0.84375 1.90625 -1.828125 1.90625 -2 Z M 1.90625 -4.4375 "/>
-</g>
-<g id="glyph-1-26">
-<path d="M 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.078125 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -4.4375 C 1.890625 -5.1875 1.90625 -5.90625 2 -6.4375 L 1.96875 -6.453125 C 1.859375 -6.4375 1.578125 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 Z M 1.03125 -4.4375 "/>
-</g>
-<g id="glyph-1-27">
-<path d="M 1.890625 -5.578125 C 1.890625 -5.90625 2 -6.03125 2.796875 -6.03125 C 3.359375 -6.03125 4.0625 -5.8125 4.0625 -4.796875 C 4.0625 -3.953125 3.5 -3.703125 2.671875 -3.703125 L 1.890625 -3.703125 Z M 1.890625 -3.265625 L 2.828125 -3.265625 C 3.984375 -3.265625 4.53125 -2.546875 4.53125 -1.640625 C 4.53125 -0.953125 4.296875 -0.40625 2.859375 -0.40625 C 2.15625 -0.40625 1.890625 -0.546875 1.890625 -0.875 Z M 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 C 2.078125 0 2.15625 0.03125 3.046875 0.03125 C 5.03125 0.03125 5.515625 -0.984375 5.515625 -1.828125 C 5.515625 -2.796875 4.875 -3.34375 4 -3.59375 C 4.515625 -3.84375 4.9375 -4.359375 4.9375 -4.921875 C 4.9375 -5.578125 4.578125 -6.453125 2.734375 -6.453125 C 2.375 -6.453125 1.96875 -6.4375 1.46875 -6.4375 Z M 1.46875 -6.4375 "/>
-</g>
-<g id="glyph-1-28">
-<path d="M 5.421875 -4.4375 L 5.421875 -2.765625 C 5.421875 -1.640625 5.25 -0.453125 3.5625 -0.453125 C 1.890625 -0.453125 1.890625 -2.140625 1.890625 -2.6875 L 1.890625 -4.4375 C 1.890625 -5.1875 1.90625 -5.90625 2 -6.4375 L 1.96875 -6.453125 C 1.859375 -6.4375 1.578125 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2.359375 C 1.03125 -0.3125 2.46875 0.09375 3.359375 0.09375 C 5.40625 0.09375 5.96875 -1.171875 5.96875 -2.9375 L 5.96875 -4.4375 C 5.96875 -5.1875 5.984375 -5.90625 6.0625 -6.4375 L 6.046875 -6.453125 C 5.921875 -6.4375 5.8125 -6.4375 5.6875 -6.4375 C 5.578125 -6.4375 5.46875 -6.4375 5.34375 -6.453125 L 5.328125 -6.4375 C 5.40625 -5.859375 5.421875 -5.1875 5.421875 -4.4375 Z M 5.421875 -4.4375 "/>
-</g>
-<g id="glyph-1-29">
-<path d="M 5.34375 -4.4375 L 5.34375 -3.640625 L 1.890625 -3.640625 L 1.890625 -4.4375 C 1.890625 -5.1875 1.90625 -5.90625 2 -6.4375 L 1.984375 -6.453125 C 1.859375 -6.4375 1.578125 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.0625 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -3.203125 L 5.34375 -3.203125 L 5.34375 -2 C 5.34375 -1.25 5.328125 -0.53125 5.25 0 L 5.25 0.03125 C 5.375 0.015625 5.65625 0 5.765625 0 C 5.890625 0 6.15625 0.015625 6.28125 0.03125 L 6.296875 0 C 6.21875 -0.5625 6.203125 -1.25 6.203125 -2 L 6.203125 -4.4375 C 6.203125 -5.1875 6.21875 -5.90625 6.296875 -6.4375 L 6.296875 -6.453125 C 6.171875 -6.4375 5.890625 -6.4375 5.765625 -6.4375 C 5.65625 -6.4375 5.390625 -6.4375 5.265625 -6.453125 L 5.25 -6.4375 C 5.328125 -5.859375 5.34375 -5.1875 5.34375 -4.4375 Z M 5.34375 -4.4375 "/>
-</g>
-<g id="glyph-1-30">
-<path d="M 3.3125 -2.84375 C 3.734375 -3.640625 4.421875 -4.609375 5.625 -6.453125 C 5.5 -6.4375 5.328125 -6.4375 5.203125 -6.4375 C 5.078125 -6.4375 4.90625 -6.4375 4.78125 -6.453125 C 4.28125 -5.40625 3.625 -4.296875 3.046875 -3.390625 C 2.40625 -4.46875 1.828125 -5.40625 1.296875 -6.453125 C 1.125 -6.4375 0.890625 -6.4375 0.734375 -6.4375 C 0.5625 -6.4375 0.328125 -6.4375 0.15625 -6.453125 C 0.5625 -5.859375 2.078125 -3.390625 2.46875 -2.734375 C 2.46875 -1.765625 2.4375 -0.515625 2.390625 0.03125 C 2.515625 0.015625 2.765625 0 2.890625 0 C 3.015625 0 3.265625 0.015625 3.390625 0.03125 C 3.34375 -0.453125 3.328125 -1.859375 3.3125 -2.84375 Z M 3.3125 -2.84375 "/>
-</g>
-<g id="glyph-1-31">
-<path d="M 5.4375 -1.171875 L 5.4375 -0.71875 C 5.109375 -0.40625 4.515625 -0.34375 3.984375 -0.34375 C 2.203125 -0.34375 1.546875 -1.9375 1.546875 -3.265625 C 1.546875 -5 2.515625 -6.125 3.859375 -6.125 C 4.75 -6.125 5.515625 -5.625 5.96875 -4.859375 L 6.09375 -4.875 C 6.125 -5.390625 6.1875 -5.65625 6.296875 -6.03125 L 6.265625 -6.0625 C 6.265625 -6.0625 5.21875 -6.5625 4 -6.5625 C 2.28125 -6.5625 0.515625 -5.359375 0.515625 -3.15625 C 0.515625 -1.4375 1.75 0.09375 3.75 0.09375 C 4.90625 0.09375 5.71875 -0.234375 6.40625 -0.796875 L 6.40625 -0.828125 C 6.3125 -0.921875 6.296875 -1.203125 6.296875 -1.3125 L 6.296875 -1.359375 C 6.296875 -2.109375 6.3125 -2.53125 6.40625 -3.0625 L 6.390625 -3.09375 C 6.390625 -3.09375 6.21875 -3.0625 5.875 -3.0625 C 5.53125 -3.0625 5.359375 -3.09375 5.359375 -3.09375 L 5.34375 -3.0625 C 5.421875 -2.5 5.4375 -1.90625 5.4375 -1.171875 Z M 5.4375 -1.171875 "/>
-</g>
-<g id="glyph-1-32">
-<path d="M 4.578125 -1.796875 C 4.578125 -2.109375 4.609375 -2.484375 4.609375 -2.796875 C 4.609375 -3.84375 4.265625 -4.375 3.375 -4.375 C 2.984375 -4.375 2.28125 -4.234375 1.640625 -3.484375 L 1.625 -3.515625 L 1.625 -4.234375 C 1.609375 -4.3125 1.609375 -4.34375 1.5625 -4.34375 C 1.328125 -4.28125 0.984375 -4.28125 0.75 -4.3125 L 0.734375 -4.28125 C 0.8125 -3.84375 0.828125 -3.09375 0.828125 -2.34375 L 0.828125 -1.796875 C 0.828125 -1.046875 0.8125 -0.53125 0.734375 0 L 0.75 0.03125 C 0.875 0.015625 1.109375 0 1.21875 0 C 1.34375 0 1.578125 0.015625 1.6875 0.03125 L 1.71875 0 C 1.640625 -0.5625 1.625 -1.03125 1.625 -1.796875 L 1.625 -2.953125 C 2.140625 -3.5625 2.71875 -3.78125 3.078125 -3.78125 C 3.578125 -3.78125 3.78125 -3.578125 3.78125 -2.765625 L 3.78125 -1.796875 C 3.78125 -1.046875 3.765625 -0.53125 3.703125 0 L 3.71875 0.03125 C 3.84375 0.015625 4.0625 0 4.1875 0 C 4.3125 0 4.53125 0.015625 4.65625 0.03125 L 4.671875 0 C 4.59375 -0.5625 4.578125 -1.03125 4.578125 -1.796875 Z M 4.578125 -1.796875 "/>
-</g>
-</g>
-<clipPath id="clip-0">
-<path clip-rule="nonzero" d="M 8.953125 193.160156 L 54.308594 193.160156 L 54.308594 215.839844 L 8.953125 215.839844 Z M 8.953125 193.160156 "/>
-</clipPath>
-<linearGradient id="linear-pattern-0" gradientUnits="userSpaceOnUse" x1="0" y1="25.002712" x2="0" y2="74.997244" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 227.18055)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.560974%, 76.560974%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.555664%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.13269%, 89.13269%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-1">
-<path clip-rule="nonzero" d="M 8.953125 159.144531 L 54.308594 159.144531 L 54.308594 181.824219 L 8.953125 181.824219 Z M 8.953125 159.144531 "/>
-</clipPath>
-<linearGradient id="linear-pattern-1" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997244" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 193.16448)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.560974%, 76.560974%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.555664%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.13269%, 89.13269%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-2">
-<path clip-rule="nonzero" d="M 8.953125 125.128906 L 54.308594 125.128906 L 54.308594 147.808594 L 8.953125 147.808594 Z M 8.953125 125.128906 "/>
-</clipPath>
-<linearGradient id="linear-pattern-2" gradientUnits="userSpaceOnUse" x1="0" y1="25.002778" x2="0" y2="74.99731" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 159.14843)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.5625%, 76.5625%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.55719%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.134216%, 89.134216%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-3">
-<path clip-rule="nonzero" d="M 8.953125 91.113281 L 54.308594 91.113281 L 54.308594 113.792969 L 8.953125 113.792969 Z M 8.953125 91.113281 "/>
-</clipPath>
-<linearGradient id="linear-pattern-3" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 125.13233)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.5625%, 76.5625%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.555664%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.13269%, 89.13269%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-4">
-<path clip-rule="nonzero" d="M 99.664062 125.128906 L 145.019531 125.128906 L 145.019531 147.808594 L 99.664062 147.808594 Z M 99.664062 125.128906 "/>
-</clipPath>
-<linearGradient id="linear-pattern-4" gradientUnits="userSpaceOnUse" x1="0" y1="25.002778" x2="0" y2="74.99731" gradientTransform="matrix(0.90721, 0, 0, -0.4536, 76.97972, 159.14843)">
-<stop offset="0" stop-color="rgb(75.686646%, 83.529663%, 96.076965%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(75.828552%, 83.625793%, 96.099854%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(76.113892%, 83.81958%, 96.14563%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(76.399231%, 84.011841%, 96.192932%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(76.683044%, 84.205627%, 96.238708%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(76.968384%, 84.397888%, 96.284485%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(77.253723%, 84.590149%, 96.330261%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(77.537537%, 84.783936%, 96.376038%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(77.822876%, 84.976196%, 96.421814%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(78.108215%, 85.169983%, 96.46759%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(78.393555%, 85.362244%, 96.514893%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(78.677368%, 85.55603%, 96.560669%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(78.962708%, 85.748291%, 96.606445%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(79.248047%, 85.942078%, 96.652222%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(79.53186%, 86.134338%, 96.697998%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(79.8172%, 86.328125%, 96.743774%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(80.102539%, 86.520386%, 96.789551%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(80.386353%, 86.714172%, 96.836853%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(80.671692%, 86.906433%, 96.882629%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(80.957031%, 87.10022%, 96.928406%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(81.240845%, 87.29248%, 96.974182%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(81.526184%, 87.484741%, 97.019958%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(81.811523%, 87.678528%, 97.065735%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(82.096863%, 87.870789%, 97.111511%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(82.380676%, 88.064575%, 97.158813%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(82.666016%, 88.256836%, 97.20459%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(82.951355%, 88.450623%, 97.250366%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(83.235168%, 88.642883%, 97.296143%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(83.520508%, 88.83667%, 97.341919%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(83.805847%, 89.028931%, 97.387695%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(84.089661%, 89.222717%, 97.433472%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(84.375%, 89.414978%, 97.480774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(84.660339%, 89.608765%, 97.52655%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(84.945679%, 89.801025%, 97.572327%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(85.229492%, 89.994812%, 97.618103%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(85.514832%, 90.187073%, 97.663879%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(85.800171%, 90.380859%, 97.709656%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(86.083984%, 90.57312%, 97.755432%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(86.369324%, 90.765381%, 97.801208%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(86.654663%, 90.959167%, 97.846985%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(86.940002%, 91.151428%, 97.892761%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(87.223816%, 91.345215%, 97.938538%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(87.509155%, 91.537476%, 97.98584%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(87.794495%, 91.731262%, 98.031616%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(88.078308%, 91.923523%, 98.077393%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(88.363647%, 92.11731%, 98.123169%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(88.648987%, 92.30957%, 98.168945%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(88.934326%, 92.503357%, 98.214722%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(89.21814%, 92.695618%, 98.260498%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(89.503479%, 92.889404%, 98.306274%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(89.788818%, 93.081665%, 98.352051%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(90.072632%, 93.275452%, 98.397827%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(90.357971%, 93.467712%, 98.443604%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(90.643311%, 93.661499%, 98.48938%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(90.927124%, 93.85376%, 98.536682%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(91.212463%, 94.046021%, 98.582458%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(91.497803%, 94.239807%, 98.628235%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(91.783142%, 94.432068%, 98.674011%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(92.066956%, 94.625854%, 98.719788%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(92.352295%, 94.818115%, 98.765564%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(92.637634%, 95.011902%, 98.81134%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(92.921448%, 95.204163%, 98.857117%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(93.206787%, 95.397949%, 98.902893%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(93.492126%, 95.59021%, 98.948669%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(93.777466%, 95.783997%, 98.994446%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-5">
-<path clip-rule="nonzero" d="M 190.371094 23.082031 L 247.066406 23.082031 L 247.066406 45.757812 L 190.371094 45.757812 Z M 190.371094 23.082031 "/>
-</clipPath>
-<linearGradient id="linear-pattern-5" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 57.09974)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-6">
-<path clip-rule="nonzero" d="M 190.371094 57.097656 L 247.066406 57.097656 L 247.066406 79.773438 L 190.371094 79.773438 Z M 190.371094 57.097656 "/>
-</clipPath>
-<linearGradient id="linear-pattern-6" gradientUnits="userSpaceOnUse" x1="0" y1="25.002756" x2="0" y2="74.997288" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 91.11583)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-7">
-<path clip-rule="nonzero" d="M 190.371094 91.113281 L 247.066406 91.113281 L 247.066406 113.792969 L 190.371094 113.792969 Z M 190.371094 91.113281 "/>
-</clipPath>
-<linearGradient id="linear-pattern-7" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 125.13233)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-8">
-<path clip-rule="nonzero" d="M 190.371094 125.128906 L 247.066406 125.128906 L 247.066406 147.808594 L 190.371094 147.808594 Z M 190.371094 125.128906 "/>
-</clipPath>
-<linearGradient id="linear-pattern-8" gradientUnits="userSpaceOnUse" x1="0" y1="25.002778" x2="0" y2="74.99731" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 159.14843)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.398438%, 83.398438%, 83.398438%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-9">
-<path clip-rule="nonzero" d="M 190.371094 159.144531 L 247.066406 159.144531 L 247.066406 181.824219 L 190.371094 181.824219 Z M 190.371094 159.144531 "/>
-</clipPath>
-<linearGradient id="linear-pattern-9" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997244" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 193.16448)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-10">
-<path clip-rule="nonzero" d="M 190.371094 193.160156 L 247.066406 193.160156 L 247.066406 215.839844 L 190.371094 215.839844 Z M 190.371094 193.160156 "/>
-</clipPath>
-<linearGradient id="linear-pattern-10" gradientUnits="userSpaceOnUse" x1="0" y1="25.002712" x2="0" y2="74.997244" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 227.18055)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-11">
-<path clip-rule="nonzero" d="M 190.371094 227.179688 L 247.066406 227.179688 L 247.066406 249.855469 L 190.371094 249.855469 Z M 190.371094 227.179688 "/>
-</clipPath>
-<linearGradient id="linear-pattern-11" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 261.19664)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-12">
-<path clip-rule="nonzero" d="M 190 226 L 248 226 L 248 250.054688 L 190 250.054688 Z M 190 226 "/>
-</clipPath>
-<clipPath id="clip-13">
-<path clip-rule="nonzero" d="M 281.082031 46 L 349.113281 46 L 349.113281 68 L 281.082031 68 Z M 281.082031 46 "/>
-</clipPath>
-<linearGradient id="linear-pattern-12" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 79.77726)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101562" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117188" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132813" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164063" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210938" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226563" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242188" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257813" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273438" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.478516%, 89.451599%, 65.478516%)" stop-opacity="1"/>
-<stop offset="0.289062" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304688" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320313" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335938" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351563" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367188" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382813" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398438" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414063" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429688" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445313" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460938" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476563" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523438" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539062" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554687" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570313" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601562" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617188" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632812" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648437" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664063" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695312" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710938" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757813" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773438" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789062" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804688" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851563" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867188" stop-color="rgb(82.020569%, 94.50531%, 82.020569%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882812" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914063" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945313" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960938" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992188" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-14">
-<path clip-rule="nonzero" d="M 280 45 L 349.308594 45 L 349.308594 69 L 280 69 Z M 280 45 "/>
-</clipPath>
-<clipPath id="clip-15">
-<path clip-rule="nonzero" d="M 281.082031 117 L 349.113281 117 L 349.113281 139 L 281.082031 139 Z M 281.082031 117 "/>
-</clipPath>
-<linearGradient id="linear-pattern-13" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 150.64429)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101562" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117188" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132812" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164062" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210937" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226562" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242187" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257812" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273437" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.478516%, 89.451599%, 65.478516%)" stop-opacity="1"/>
-<stop offset="0.289062" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304687" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320312" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335937" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351562" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367187" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382812" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398437" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414062" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429687" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445312" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460937" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476562" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523437" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539063" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554688" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570312" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601563" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617188" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632813" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648438" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664062" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695312" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710938" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757813" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773438" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789062" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804688" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851562" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867188" stop-color="rgb(82.020569%, 94.50531%, 82.020569%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882813" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914062" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945312" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960938" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992187" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-16">
-<path clip-rule="nonzero" d="M 280 116 L 349.308594 116 L 349.308594 140 L 280 140 Z M 280 116 "/>
-</clipPath>
-<clipPath id="clip-17">
-<path clip-rule="nonzero" d="M 281.082031 159.144531 L 349.113281 159.144531 L 349.113281 181.824219 L 281.082031 181.824219 Z M 281.082031 159.144531 "/>
-</clipPath>
-<linearGradient id="linear-pattern-14" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997244" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 193.16448)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101563" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117187" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132813" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164063" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210938" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226563" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242187" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257813" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273438" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.47699%, 89.451599%, 65.47699%)" stop-opacity="1"/>
-<stop offset="0.289063" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304688" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320313" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335938" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351563" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367188" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382813" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398438" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414062" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429688" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445313" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460938" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476562" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523437" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539062" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554688" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570312" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601563" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617187" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632812" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648438" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664062" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695313" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710937" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757812" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773437" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789063" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804687" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851562" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867187" stop-color="rgb(82.019043%, 94.50531%, 82.019043%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882812" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914062" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945313" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960937" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992188" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-18">
-<path clip-rule="nonzero" d="M 280 158 L 349.308594 158 L 349.308594 183 L 280 183 Z M 280 158 "/>
-</clipPath>
-<clipPath id="clip-19">
-<path clip-rule="nonzero" d="M 281.082031 202 L 349.113281 202 L 349.113281 224 L 281.082031 224 Z M 281.082031 202 "/>
-</clipPath>
-<linearGradient id="linear-pattern-15" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 235.68468)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101562" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117188" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132812" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164062" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210937" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226562" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242187" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257812" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273437" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.478516%, 89.451599%, 65.478516%)" stop-opacity="1"/>
-<stop offset="0.289062" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304687" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320312" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335937" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351562" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367187" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382812" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398437" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414062" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429687" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445312" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460937" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476562" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523437" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539063" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554688" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570312" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601563" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617188" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632813" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648438" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664062" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695312" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710938" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757813" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773438" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789062" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804688" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851562" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867188" stop-color="rgb(82.020569%, 94.50531%, 82.020569%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882813" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914062" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945312" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960938" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992187" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-20">
-<path clip-rule="nonzero" d="M 280 201 L 349.308594 201 L 349.308594 225 L 280 225 Z M 280 201 "/>
-</clipPath>
-</defs>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-0" x="3.321" y="11.678"/>
-<use xlink:href="#glyph-0-1" x="10.840802" y="11.678"/>
-<use xlink:href="#glyph-0-1" x="17.093356" y="11.678"/>
-<use xlink:href="#glyph-0-2" x="23.34591" y="11.678"/>
-<use xlink:href="#glyph-0-3" x="26.358613" y="11.678"/>
-<use xlink:href="#glyph-0-4" x="29.466957" y="11.678"/>
-<use xlink:href="#glyph-0-5" x="34.595725" y="11.678"/>
-<use xlink:href="#glyph-0-6" x="40.346161" y="11.678"/>
-<use xlink:href="#glyph-0-3" x="44.410919" y="11.678"/>
-<use xlink:href="#glyph-0-7" x="47.519263" y="11.678"/>
-<use xlink:href="#glyph-0-8" x="53.496848" y="11.678"/>
-</g>
-<g clip-path="url(#clip-0)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-0)" d="M 8.953125 215.839844 L 8.953125 193.160156 L 54.308594 193.160156 L 54.308594 215.839844 Z M 8.953125 215.839844 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 -51.024844 L 0.000125 -28.345156 L 45.355594 -28.345156 L 45.355594 -51.024844 Z M 0.000125 -51.024844 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-0" x="21.04" y="207.928"/>
-<use xlink:href="#glyph-1-1" x="27.017584" y="207.928"/>
-<use xlink:href="#glyph-1-2" x="31.809614" y="207.928"/>
-<use xlink:href="#glyph-1-3" x="35.196911" y="207.928"/>
-<use xlink:href="#glyph-1-4" x="39.709987" y="207.928"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 -39.685 L 89.449344 20.943906 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 142.136719 L 96.261719 143.902344 L 98.402344 143.871094 L 99.035156 145.917969 "/>
-<g clip-path="url(#clip-1)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-1)" d="M 8.953125 181.824219 L 8.953125 159.144531 L 54.308594 159.144531 L 54.308594 181.824219 Z M 8.953125 181.824219 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 -17.009219 L 0.000125 5.670469 L 45.355594 5.670469 L 45.355594 -17.009219 Z M 0.000125 -17.009219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-5" x="18.818" y="173.713"/>
-<use xlink:href="#glyph-1-6" x="24.387116" y="173.713"/>
-<use xlink:href="#glyph-1-7" x="29.747016" y="173.713"/>
-<use xlink:href="#glyph-1-8" x="35.126842" y="173.713"/>
-<use xlink:href="#glyph-1-9" x="40.167938" y="173.713"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 -5.669375 L 88.945437 24.299375 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 139.304688 L 95.867188 139.832031 L 97.898438 140.515625 L 97.808594 142.65625 "/>
-<g clip-path="url(#clip-2)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-2)" d="M 8.953125 147.808594 L 8.953125 125.128906 L 54.308594 125.128906 L 54.308594 147.808594 Z M 8.953125 147.808594 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 17.006406 L 0.000125 39.686094 L 45.355594 39.686094 L 45.355594 17.006406 Z M 0.000125 17.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-10" x="14.026" y="139.896"/>
-<use xlink:href="#glyph-1-3" x="21.179176" y="139.896"/>
-<use xlink:href="#glyph-1-11" x="25.692251" y="139.896"/>
-<use xlink:href="#glyph-1-12" x="30.703459" y="139.896"/>
-<use xlink:href="#glyph-1-13" x="35.336087" y="139.896"/>
-<use xlink:href="#glyph-1-13" x="39.968715" y="139.896"/>
-<use xlink:href="#glyph-1-13" x="44.601342" y="139.896"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 28.34625 L 88.566531 28.34625 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 136.46875 L 96.234375 134.753906 L 97.519531 136.46875 L 96.234375 138.183594 "/>
-<g clip-path="url(#clip-3)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-3)" d="M 8.953125 113.792969 L 8.953125 91.113281 L 54.308594 91.113281 L 54.308594 113.792969 Z M 8.953125 113.792969 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 51.022031 L 0.000125 73.701719 L 45.355594 73.701719 L 45.355594 51.022031 Z M 0.000125 51.022031 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-14" x="17.613" y="105.736"/>
-<use xlink:href="#glyph-1-15" x="26.439899" y="105.736"/>
-<use xlink:href="#glyph-1-6" x="31.46107" y="105.736"/>
-<use xlink:href="#glyph-1-14" x="36.82097" y="105.736"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 62.361875 L 88.945437 32.393125 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 133.632812 L 97.808594 130.28125 L 97.898438 132.421875 L 95.867188 133.105469 "/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-9" x="104.46" y="11.666"/>
-<use xlink:href="#glyph-0-3" x="110.616913" y="11.666"/>
-<use xlink:href="#glyph-0-10" x="113.725257" y="11.666"/>
-<use xlink:href="#glyph-0-11" x="119.750662" y="11.666"/>
-<use xlink:href="#glyph-0-5" x="123.994748" y="11.666"/>
-<use xlink:href="#glyph-0-11" x="129.745185" y="11.666"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-12" x="134.168597" y="11.666"/>
-</g>
-<g clip-path="url(#clip-4)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-4)" d="M 99.664062 147.808594 L 99.664062 125.128906 L 145.019531 125.128906 L 145.019531 147.808594 Z M 99.664062 147.808594 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(23.529053%, 35.293579%, 54.116821%)" stroke-opacity="1" stroke-miterlimit="10" d="M 90.711062 17.006406 L 90.711062 39.686094 L 136.066531 39.686094 L 136.066531 17.006406 Z M 90.711062 17.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-4" x="105.009" y="139.896"/>
-<use xlink:href="#glyph-1-16" x="107.519585" y="139.896"/>
-<use xlink:href="#glyph-1-17" x="110.109872" y="139.896"/>
-<use xlink:href="#glyph-1-18" x="115.131042" y="139.896"/>
-<use xlink:href="#glyph-1-6" x="121.87575" y="139.896"/>
-<use xlink:href="#glyph-1-6" x="127.23565" y="139.896"/>
-<use xlink:href="#glyph-1-19" x="132.59555" y="139.896"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.550906 128.44 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 34.417969 L 187.414062 36.855469 L 189.503906 36.375 L 190.546875 38.246094 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.230594 94.59625 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 68.4375 L 187.046875 70.335938 L 189.183594 70.21875 L 189.898438 72.238281 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 179.707156 61.076719 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 102.453125 L 186.601562 103.136719 L 188.660156 103.738281 L 188.660156 105.878906 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 179.277469 28.34625 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 136.46875 L 186.945312 134.753906 L 188.230469 136.46875 L 186.945312 138.183594 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 179.707156 -4.384219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 170.484375 L 188.660156 167.058594 L 188.660156 169.199219 L 186.601562 169.796875 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.230594 -37.90375 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 204.5 L 189.898438 200.699219 L 189.183594 202.71875 L 187.046875 202.597656 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.550906 -71.743594 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 238.515625 L 190.546875 234.6875 L 189.503906 236.558594 L 187.414062 236.082031 "/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-13" x="193.706" y="13.023"/>
-<use xlink:href="#glyph-0-5" x="201.070385" y="13.023"/>
-<use xlink:href="#glyph-0-4" x="206.820821" y="13.023"/>
-<use xlink:href="#glyph-0-14" x="211.949589" y="13.023"/>
-<use xlink:href="#glyph-0-15" x="217.96304" y="13.023"/>
-<use xlink:href="#glyph-0-8" x="223.378732" y="13.023"/>
-<use xlink:href="#glyph-0-16" x="229.822569" y="13.023"/>
-<use xlink:href="#glyph-0-17" x="236.158809" y="13.023"/>
-</g>
-<g clip-path="url(#clip-5)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-5)" d="M 190.371094 45.757812 L 190.371094 23.082031 L 247.066406 23.082031 L 247.066406 45.757812 Z M 190.371094 45.757812 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 119.057187 L 181.418094 141.732969 L 238.113406 141.732969 L 238.113406 119.057187 Z M 181.418094 119.057187 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-5" x="204.654" y="37.65"/>
-<use xlink:href="#glyph-1-20" x="210.223116" y="37.65"/>
-<use xlink:href="#glyph-1-21" x="215.573053" y="37.65"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-3" x="219.03009" y="37.65"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="226.033825" y="37.65"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 130.393125 L 270.246219 112.990781 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 52.84375 L 278.886719 49.707031 L 279.199219 51.824219 L 277.253906 52.71875 "/>
-<g clip-path="url(#clip-6)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-6)" d="M 190.371094 79.773438 L 190.371094 57.097656 L 247.066406 57.097656 L 247.066406 79.773438 Z M 190.371094 79.773438 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 85.041562 L 181.418094 107.717344 L 238.113406 107.717344 L 238.113406 85.041562 Z M 181.418094 85.041562 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-22" x="210.104" y="71.665"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-23" x="215.254685" y="71.665"/>
-<use xlink:href="#glyph-1-24" x="221.541111" y="71.665"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 96.3775 L 270.1095 107.709531 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 56.390625 L 277.28125 55.917969 L 279.0625 57.105469 L 278.421875 59.148438 "/>
-<g clip-path="url(#clip-7)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-7)" d="M 190.371094 113.792969 L 190.371094 91.113281 L 247.066406 91.113281 L 247.066406 113.792969 Z M 190.371094 113.792969 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 51.022031 L 181.418094 73.701719 L 238.113406 73.701719 L 238.113406 51.022031 Z M 181.418094 51.022031 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-25" x="197.377" y="105.701"/>
-<use xlink:href="#glyph-1-26" x="202.50776" y="105.701"/>
-<use xlink:href="#glyph-1-27" x="205.436776" y="105.701"/>
-<use xlink:href="#glyph-1-24" x="211.573762" y="105.701"/>
-<use xlink:href="#glyph-1-8" x="217.362056" y="105.701"/>
-<use xlink:href="#glyph-1-14" x="222.403152" y="105.701"/>
-<use xlink:href="#glyph-1-14" x="231.230051" y="105.701"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 62.361875 L 270.777469 102.514219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 60.640625 L 277.589844 62.21875 L 279.730469 62.300781 L 280.25 64.378906 "/>
-<g clip-path="url(#clip-8)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-8)" d="M 190.371094 147.808594 L 190.371094 125.128906 L 247.066406 125.128906 L 247.066406 147.808594 Z M 190.371094 147.808594 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 17.006406 L 181.418094 39.686094 L 238.113406 39.686094 L 238.113406 17.006406 Z M 181.418094 17.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="205.372" y="139.697"/>
-<use xlink:href="#glyph-1-28" x="212.116707" y="139.697"/>
-<use xlink:href="#glyph-1-19" x="219.010854" y="139.697"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-22" x="225.795412" y="139.697"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 28.34625 L 270.097781 39.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 125.128906 L 277.289062 124.585938 L 279.050781 125.808594 L 278.371094 127.839844 "/>
-<g clip-path="url(#clip-9)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-9)" d="M 190.371094 181.824219 L 190.371094 159.144531 L 247.066406 159.144531 L 247.066406 181.824219 Z M 190.371094 181.824219 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 -17.009219 L 181.418094 5.670469 L 238.113406 5.670469 L 238.113406 -17.009219 Z M 181.418094 -17.009219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-29" x="210.851" y="173.718"/>
-<use xlink:href="#glyph-1-26" x="218.083877" y="173.718"/>
-<use xlink:href="#glyph-1-5" x="221.012893" y="173.718"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -5.669375 L 269.988406 -5.005313 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 169.777344 L 277.621094 168.132812 L 278.941406 169.820312 L 277.691406 171.5625 "/>
-<g clip-path="url(#clip-10)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-10)" d="M 190.371094 215.839844 L 190.371094 193.160156 L 247.066406 193.160156 L 247.066406 215.839844 Z M 190.371094 215.839844 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 -51.024844 L 181.418094 -28.345156 L 238.113406 -28.345156 L 238.113406 -51.024844 Z M 181.418094 -51.024844 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-8" x="207.544" y="207.728"/>
-<use xlink:href="#glyph-1-30" x="212.585096" y="207.728"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="218.014735" y="207.728"/>
-<use xlink:href="#glyph-1-25" x="224.759442" y="207.728"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -39.685 L 270.050906 -47.669375 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 213.003906 L 278.171875 210.511719 L 279.003906 212.484375 L 277.339844 213.835938 "/>
-<g clip-path="url(#clip-11)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-11)" d="M 190.371094 249.855469 L 190.371094 227.179688 L 247.066406 227.179688 L 247.066406 249.855469 Z M 190.371094 249.855469 "/>
-</g>
-<g clip-path="url(#clip-12)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 -85.040469 L 181.418094 -62.364688 L 238.113406 -62.364688 L 238.113406 -85.040469 Z M 181.418094 -85.040469 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-14" x="200.221" y="241.764"/>
-<use xlink:href="#glyph-1-22" x="209.047899" y="241.764"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="214.97567" y="241.764"/>
-<use xlink:href="#glyph-1-14" x="222.118883" y="241.764"/>
-<use xlink:href="#glyph-1-22" x="230.945782" y="241.764"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -73.700625 L 271.492313 33.389219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 129.382812 L 278.425781 132.144531 L 280.445312 131.425781 L 281.699219 133.164062 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -73.700625 L 271.156375 -8.993594 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 171.902344 L 277.996094 174.175781 L 280.109375 173.808594 L 281.050781 175.734375 "/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-18" x="283.578" y="13.023"/>
-<use xlink:href="#glyph-0-5" x="292.257453" y="13.023"/>
-<use xlink:href="#glyph-0-11" x="298.00789" y="13.023"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-16" x="302.156334" y="13.023"/>
-<use xlink:href="#glyph-0-19" x="308.492574" y="13.023"/>
-<use xlink:href="#glyph-0-5" x="317.124207" y="13.023"/>
-<use xlink:href="#glyph-0-11" x="322.874644" y="13.023"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-15" x="327.023088" y="13.023"/>
-</g>
-<g clip-path="url(#clip-13)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-12)" d="M 281.082031 68.4375 L 281.082031 45.757812 L 349.113281 45.757812 L 349.113281 68.4375 Z M 281.082031 68.4375 "/>
-</g>
-<g clip-path="url(#clip-14)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 96.3775 L 272.129031 119.057187 L 340.160281 119.057187 L 340.160281 96.3775 Z M 272.129031 96.3775 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="305.247" y="60.327"/>
-<use xlink:href="#glyph-1-5" x="311.991707" y="60.327"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="318.048992" y="60.327"/>
-</g>
-<g clip-path="url(#clip-15)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-13)" d="M 281.082031 139.304688 L 281.082031 116.625 L 349.113281 116.625 L 349.113281 139.304688 Z M 281.082031 139.304688 "/>
-</g>
-<g clip-path="url(#clip-16)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 25.510312 L 272.129031 48.19 L 340.160281 48.19 L 340.160281 25.510312 Z M 272.129031 25.510312 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-10" x="287.364" y="131.193"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-23" x="294.756279" y="131.193"/>
-<use xlink:href="#glyph-1-26" x="301.042705" y="131.193"/>
-<use xlink:href="#glyph-1-19" x="303.971721" y="131.193"/>
-<use xlink:href="#glyph-1-26" x="311.045195" y="131.193"/>
-<use xlink:href="#glyph-1-22" x="313.974211" y="131.193"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="322.731372" y="131.193"/>
-<use xlink:href="#glyph-1-5" x="329.874585" y="131.193"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="335.93187" y="131.193"/>
-</g>
-<g clip-path="url(#clip-17)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-14)" d="M 281.082031 181.824219 L 281.082031 159.144531 L 349.113281 159.144531 L 349.113281 181.824219 Z M 281.082031 181.824219 "/>
-</g>
-<g clip-path="url(#clip-18)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 -17.009219 L 272.129031 5.670469 L 340.160281 5.670469 L 340.160281 -17.009219 Z M 272.129031 -17.009219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-22" x="292.719" y="173.733"/>
-<use xlink:href="#glyph-1-14" x="298.985501" y="173.733"/>
-<use xlink:href="#glyph-1-19" x="307.8124" y="173.733"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="317.376534" y="173.733"/>
-<use xlink:href="#glyph-1-5" x="324.519747" y="173.733"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="330.577032" y="173.733"/>
-</g>
-<g clip-path="url(#clip-19)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-15)" d="M 281.082031 224.34375 L 281.082031 201.664062 L 349.113281 201.664062 L 349.113281 224.34375 Z M 281.082031 224.34375 "/>
-</g>
-<g clip-path="url(#clip-20)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 -59.52875 L 272.129031 -36.849063 L 340.160281 -36.849063 L 340.160281 -59.52875 Z M 272.129031 -59.52875 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-26" x="294.447" y="216.432"/>
-<use xlink:href="#glyph-1-32" x="297.376016" y="216.432"/>
-<use xlink:href="#glyph-1-2" x="302.745879" y="216.432"/>
-<use xlink:href="#glyph-1-3" x="306.133177" y="216.432"/>
-<use xlink:href="#glyph-1-4" x="310.646253" y="216.432"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="315.647498" y="216.432"/>
-<use xlink:href="#glyph-1-5" x="322.790711" y="216.432"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="328.847996" y="216.432"/>
-</g>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   version="1.1"
+   id="svg1"
+   width="450.62799"
+   height="333.42267"
+   viewBox="0 0 450.62799 333.42267"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <defs
+     id="defs1">
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient5">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop1" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop2" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop3" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop4" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop5" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath6">
+      <path
+         d="M 0,68.03218 V 90.7097 H 45.35506 V 68.03218 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path6" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient14">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop10" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop11" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop12" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop13" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop14" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath15">
+      <path
+         d="M 0,34.01608 V 56.6936 H 45.35506 V 34.01608 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path15" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient23">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop19" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop20" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop21" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop22" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop23" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath24">
+      <path
+         d="M 0,0 V 22.67752 H 45.35506 V 0 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path24" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient32">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop28" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop29" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop30" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop31" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop32" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath33">
+      <path
+         d="m 0,-34.01648 v 22.67752 h 45.35506 v -22.67752 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path33" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient42">
+      <stop
+         style="stop-opacity:1;stop-color:#c1d5f5"
+         offset="0"
+         id="stop38" />
+      <stop
+         style="stop-opacity:1;stop-color:#c1d5f5"
+         offset="0.25"
+         id="stop39" />
+      <stop
+         style="stop-opacity:1;stop-color:#d8e5f9"
+         offset="0.5"
+         id="stop40" />
+      <stop
+         style="stop-opacity:1;stop-color:#f0f5fd"
+         offset="0.75"
+         id="stop41" />
+      <stop
+         style="stop-opacity:1;stop-color:#f0f5fd"
+         offset="1"
+         id="stop42" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath43">
+      <path
+         d="m 90.7097,17.00783 v 22.67752 h 45.35506 V 17.00783 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path43" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient64">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop60" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop61" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop62" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop63" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop64" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath65">
+      <path
+         d="m 181.4194,119.0565 v 22.67752 h 56.69362 V 119.0565 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path65" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient73">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop69" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop70" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop71" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop72" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop73" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath74">
+      <path
+         d="m 181.4194,85.04042 v 22.67752 h 56.69362 V 85.04042 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path74" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient82">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop78" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop79" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop80" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop81" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop82" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath83">
+      <path
+         d="m 181.4194,51.02391 v 22.67752 h 56.69362 V 51.02391 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path83" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient91">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop87" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop88" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop89" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop90" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop91" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath92">
+      <path
+         d="m 181.4194,17.00783 v 22.67752 h 56.69362 V 17.00783 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path92" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient100">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop96" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop97" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop98" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop99" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop100" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath101">
+      <path
+         d="M 181.4194,-17.00824 V 5.66927 h 56.69362 v -22.67751 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path101" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient109">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop105" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop106" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop107" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop108" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop109" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath110">
+      <path
+         d="m 181.4194,-51.02432 v 22.67752 h 56.69362 v -22.67752 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path110" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient118">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop114" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop115" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop116" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop117" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop118" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath119">
+      <path
+         d="m 181.4194,-85.0404 v 22.67752 h 56.69362 V -85.0404 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path119" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient130">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop126" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop127" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop128" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop129" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop130" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath131">
+      <path
+         d="m 272.12953,85.04042 v 22.67752 h 62.36289 V 85.04042 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path131" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient137">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop133" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop134" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop135" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop136" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop137" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath138">
+      <path
+         d="m 272.12953,17.00783 v 22.67752 h 62.36289 V 17.00783 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path138" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient144">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop140" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop141" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop142" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop143" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop144" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath145">
+      <path
+         d="M 272.12953,-17.00824 V 5.66927 h 62.36289 v -22.67751 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path145" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient151">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop147" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop148" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop149" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop150" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop151" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath152">
+      <path
+         d="m 272.12953,-51.02432 v 22.67752 h 62.36289 v -22.67752 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path152" />
+    </clipPath>
+  </defs>
+  <g
+     id="g1">
+    <path
+       id="path1"
+       d="m 4.8645469,-3.2928672 c -0.3984375,0.023437 -0.890625,0.035156 -1.2421875,0.035156 -0.3515625,0 -0.9960938,-0.011719 -1.2539063,-0.046875 l 1.265625,-2.9179688 h 0.023437 c 0.4921875,1.1132813 0.890625,2.0976563 1.2070318,2.9296878 z m -2.71875,0.5625 c 0.2929687,-0.023437 1.1132812,-0.035156 1.5351562,-0.035156 0.4570313,0 1.1132813,0.011719 1.40625,0.035156 0.5742188,1.5703125 0.84375,2.50781251 0.9257813,2.76562501 C 6.2122031,1.015625e-4 6.4114219,1.015625e-4 6.6106406,1.015625e-4 c 0.1992188,0 0.515625,0 0.7148438,0.0351562475 C 6.7278281,-1.1717734 5.1575156,-5.2498984 4.0559531,-7.8631797 H 3.7161094 C 2.5676719,-5.2147422 1.3957969,-2.6014609 0.18876563,0.03525781 0.34110938,1.015625e-4 0.48173438,1.015625e-4 0.61064063,1.015625e-4 c 0.12890625,0 0.39843747,0 0.53906247,0.0351562475 0.1875,-0.7265625 0.5625,-1.71093751 0.9960938,-2.76562501 z m 7.3066406,-0.7851562 c 0.2695313,-0.6328125 0.9726565,-1.1835938 1.4414065,-1.1835938 0.808594,0 1.230469,0.7382813 1.230469,2.0976563 0,0.9726562 -0.28125,2.23828121 -1.558594,2.23828121 -0.1875,0 -0.6914065,-0.046875 -1.1132815,-0.5390625 z m 0,-0.6914063 v -0.8789062 c 0,-0.082031 -0.023437,-0.1171875 -0.070312,-0.1171875 -0.2695312,0.058594 -0.703125,0.070312 -0.9609375,0.035156 l -0.035156,0.035156 c 0.09375,0.5273437 0.1171875,1.4296875 0.1171875,2.3203125 v 3.19921871 c 0,0.89062499 -0.023437,1.75781249 -0.1171875,2.39062499 l 0.035156,0.035156 c 0.140625,-0.023437 0.4101563,-0.035156 0.5507813,-0.035156 0.1523437,0 0.421875,0.011719 0.5625,0.035156 l 0.023437,-0.035156 C 9.4641563,2.0977578 9.4524375,1.2891641 9.4524375,0.38682031 v -0.52734375 c 0.3164063,0.1640625 0.7617185,0.2578125 1.1835935,0.2578125 1.582032,0 2.53125,-1.19531246 2.53125,-2.88281246 0,-1.265625 -0.738281,-2.484375 -2.074218,-2.484375 -0.46875,0 -1.066407,0.2460937 -1.617188,1.078125 z m 6.2548825,0.6914063 c 0.269532,-0.6328125 0.972657,-1.1835938 1.441407,-1.1835938 0.808593,0 1.230468,0.7382813 1.230468,2.0976563 0,0.9726562 -0.28125,2.23828121 -1.558593,2.23828121 -0.1875,0 -0.691407,-0.046875 -1.113282,-0.5390625 z m 0,-0.6914063 v -0.8789062 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.03516,0.035156 c 0.09375,0.5273437 0.117188,1.4296875 0.117188,2.3203125 v 3.19921871 c 0,0.89062499 -0.02344,1.75781249 -0.117188,2.39062499 l 0.03516,0.035156 c 0.140625,-0.023437 0.410157,-0.035156 0.550782,-0.035156 0.152343,0 0.421875,0.011719 0.5625,0.035156 l 0.02344,-0.035156 C 15.719039,2.0977578 15.70732,1.2891641 15.70732,0.38682031 v -0.52734375 c 0.316407,0.1640625 0.761719,0.2578125 1.183594,0.2578125 1.582031,0 2.53125,-1.19531246 2.53125,-2.88281246 0,-1.265625 -0.738281,-2.484375 -2.074219,-2.484375 -0.46875,0 -1.066406,0.2460937 -1.617187,1.078125 z m 5.396485,1.8164063 c 0,0.890625 -0.02344,1.75781246 -0.105469,2.3906249625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410157,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410157,0.0117187475 0.5625,0.0351562475 L 22.170211,1.015625e-4 C 22.064742,-0.67958594 22.053023,-1.4764609 22.053023,-2.3905234 v -3.5859375 c 0,-0.9023438 0.04687,-1.4765625 0.117188,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.117188,-0.1171875 -0.292968,0.1171875 -0.5625,0.1992187 -1.03125,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105469,1.40625 0.105469,2.3085938 z m 2.894531,-4.7460938 c 0,0.328125 0.28125,0.609375 0.609375,0.609375 0.316406,0 0.609375,-0.28125 0.609375,-0.609375 0,-0.3164062 -0.292969,-0.609375 -0.609375,-0.609375 -0.328125,0 -0.609375,0.2929688 -0.609375,0.609375 z m 0.117187,4.3242188 v 0.65625 c 0,0.9023437 -0.02344,1.51171871 -0.105468,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.5625,0.0351562475 L 25.18193,1.015625e-4 C 25.076461,-0.66786719 25.064742,-1.2420859 25.064742,-2.1561484 v -0.84375 c 0,-0.9023438 0.02344,-1.3242188 0.117188,-2.0507813 0,-0.1171875 -0.02344,-0.1289062 -0.117188,-0.1289062 -0.269531,0.035156 -0.785156,0.035156 -1.03125,0.011719 l -0.02344,0.035156 c 0.07031,0.515625 0.105468,1.4296875 0.105468,2.3203125 z m 5.018555,-1.9570313 c 0.457031,0 0.867188,0.3515625 1.195313,0.8789063 l 0.152343,-0.011719 0.28125,-0.9726562 -0.02344,-0.035156 c -0.375,-0.1992187 -0.996094,-0.3398437 -1.59375,-0.3398437 -1.242188,0 -2.542969,1.0195312 -2.542969,2.6484375 0,1.65234371 0.9375,2.71874996 2.402344,2.71874996 0.714844,0 1.277344,-0.234375 1.757812,-0.84375 l -0.234375,-0.2578125 h -0.04687 c -0.445312,0.41015625 -0.832031,0.50390625 -1.277343,0.50390625 -0.867188,0 -1.546875,-0.78515621 -1.546875,-2.17968751 0,-1.3242187 0.691406,-2.109375 1.476562,-2.109375 z m 5.844727,2.0742188 -0.04687,1.4296875 c 0,0.1523437 -0.07031,0.234375 -0.164063,0.30468746 -0.339844,0.2578125 -0.738281,0.46875 -1.101562,0.46875 -0.539063,0 -0.878907,-0.3515625 -0.878907,-0.72656246 0,-0.5390625 0.246094,-0.9492188 1.183594,-1.1953125 z m 0,2.12109371 c 0.128906,0.46875 0.492187,0.69140625 0.9375,0.69140625 0.292968,0 0.667968,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117188,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749996 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914062,0 -1.464844,0.4453125 -1.757812,0.6914063 l -0.03516,0.046875 0.199219,0.8085937 0.152344,0.011719 c 0.339844,-0.5390624 0.738281,-1.0195312 1.3125,-1.0195312 0.433594,0 1.148437,0.058594 1.148437,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523437 l -1.113281,0.2460938 c -1.21875,0.28125 -1.980469,0.9257812 -1.980469,1.734375 0,0.89062501 0.609375,1.28906251 1.488281,1.28906251 0.667969,0 0.996094,-0.15234375 1.628907,-0.69140625 z m 3.18457,-4.55859371 c -0.257812,0 -0.503906,0 -0.644531,-0.023437 -0.08203,0.2109375 -0.152344,0.3164062 -0.269531,0.4921875 l 0.05859,0.082031 c 0.210938,-0.011719 0.574219,-0.011719 0.855469,-0.023437 v 1.640625 c 0,0.7148438 -0.03516,1.5820313 -0.03516,1.9335938 0,0.78515621 0.503906,1.14843746 1.054687,1.14843746 0.492188,0 0.878907,-0.1171874975 1.382813,-0.4453125 l -0.152344,-0.28125 c -0.363281,0.1171875 -0.632812,0.12890625 -0.9375,0.0820313 -0.28125,-0.0351563 -0.398437,-0.31640625 -0.398437,-0.98437506 0,-0.3515625 0.04687,-0.8671875 0.04687,-1.5820312 v -1.5117188 h 0.445312 c 0.292969,0 0.691406,0.011719 0.867188,0.023437 0.03516,-0.1875 0.07031,-0.3046875 0.140625,-0.4804687 l -0.05859,-0.09375 c -0.222656,0.011719 -0.5625,0.023437 -0.84375,0.023437 h -0.550781 c 0,-0.8789063 0,-1.0429688 0.05859,-1.7460938 0,-0.082031 -0.03516,-0.1171875 -0.117187,-0.1171875 -0.292969,0.1171875 -0.445313,0.2578125 -0.832031,0.3046875 l -0.02344,0.035156 c -0.02344,0.421875 -0.04687,0.8320312 -0.04687,1.5234375 z m 3.887695,-2.0039063 c 0,0.328125 0.28125,0.609375 0.609375,0.609375 0.316407,0 0.609375,-0.28125 0.609375,-0.609375 0,-0.3164062 -0.292968,-0.609375 -0.609375,-0.609375 -0.328125,0 -0.609375,0.2929688 -0.609375,0.609375 z m 0.117188,4.3242188 v 0.65625 c 0,0.9023437 -0.02344,1.51171871 -0.105469,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.5625,0.0351562475 L 43.234664,1.015625e-4 C 43.129195,-0.66786719 43.117477,-1.2420859 43.117477,-2.1561484 v -0.84375 c 0,-0.9023438 0.02344,-1.3242188 0.117187,-2.0507813 0,-0.1171875 -0.02344,-0.1289062 -0.117187,-0.1289062 -0.269532,0.035156 -0.785157,0.035156 -1.03125,0.011719 l -0.02344,0.035156 c 0.07031,0.515625 0.105469,1.4296875 0.105469,2.3203125 z m 2.487305,0.3632812 c 0,1.4296875 0.9375,2.56640626 2.53125,2.56640626 1.582031,0 2.53125,-1.11328125 2.53125,-2.67187496 0,-1.6289063 -0.867188,-2.6953125 -2.484375,-2.6953125 -1.570313,0 -2.578125,1.1015625 -2.578125,2.8007812 z m 2.496093,-2.3203125 c 1.265625,0 1.511719,0.984375 1.511719,2.4257813 0,1.1367187 -0.5625,1.98046871 -1.371094,1.98046871 -1.265625,0 -1.582031,-1.37109371 -1.582031,-2.28515621 0,-1.0429688 0.316406,-2.1210938 1.441406,-2.1210938 z m 8.519532,2.6132813 c 0,-0.375 0.02344,-0.8203125 0.02344,-1.2070313 0,-1.2421875 -0.398437,-1.8867187 -1.476562,-1.8867187 -0.457032,0 -1.300782,0.1875 -2.074219,1.078125 l -0.02344,-0.035156 v -0.8789062 c -0.01172,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.03516,0.035156 c 0.09375,0.5273437 0.117188,1.4296875 0.117188,2.3203125 v 0.65625 c 0,0.9023437 -0.01172,1.51171871 -0.117188,2.1562499625 l 0.03516,0.0351562475 c 0.140625,-0.0234375 0.410157,-0.0351562475 0.550782,-0.0351562475 0.152343,0 0.421875,0.0117187475 0.5625,0.0351562475 L 52.225875,1.015625e-4 C 52.132125,-0.67958594 52.120406,-1.2420859 52.120406,-2.1561484 v -1.3945313 c 0.609375,-0.7148437 1.300782,-0.984375 1.734375,-0.984375 0.609375,0 0.867188,0.2460938 0.867188,1.2070313 v 1.171875 c 0,0.9023437 -0.02344,1.52343746 -0.117188,2.1562499625 l 0.02344,0.0351562475 c 0.152344,-0.0234375 0.421875,-0.0351562475 0.5625,-0.0351562475 0.152344,0 0.421875,0.0117187475 0.5625,0.0351562475 L 55.776656,1.015625e-4 C 55.682906,-0.67958594 55.671188,-1.2420859 55.671188,-2.1561484 Z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Application"
+       transform="matrix(1.3333333,0,0,1.3333333,4.428,15.605333)" />
+    <g
+       id="g5"
+       clip-path="url(#clipPath6)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,144.18276)"
+         style="fill:url(#linearGradient5);stroke:none"
+         id="path5" />
+    </g>
+    <path
+       id="path7"
+       d="M 0,68.03218 V 90.7097 H 45.35506 V 68.03218 Z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path8"
+       d="m 1.897875,-5.6257266 c 0,-0.2929687 0.1757812,-0.4453125 0.890625,-0.4453125 0.6914062,0 1.4179687,0.1875 1.4179687,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.5234375,1.5117188 -0.2578125,0 -0.65625,-0.035156 -0.7851562,-0.1054688 z m -0.8671875,1.1953125 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 0.960375,0.03442969 c 0.1171875,-0.0234375 0.3867187,-0.0351562525 0.5039062,-0.0351562525 0.1171875,0 0.3867188,0.0117187525 0.5039063,0.0351562525 L 1.991625,-7.265625e-4 C 1.9095937,-0.56322656 1.897875,-1.2429141 1.897875,-1.9929141 v -0.8554687 c 0.2109375,0.070312 0.4804687,0.09375 0.8320312,0.09375 1.8164063,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.3085937,-1.7460938 -0.2578125,0 -1.1015625,0.070312 -1.3945313,0.070312 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.09375,1.992188 z m 6.4365234,2.4375 v -1.1601562 c 0.46875,0 1.5234375,0.023437 2.2148438,0.09375 l 0.035156,-0.035156 c -0.023437,-0.082031 -0.035156,-0.2226562 -0.035156,-0.3164062 0,-0.082031 0.011719,-0.2226563 0.035156,-0.3164063 l -0.035156,-0.023437 c -0.5859375,0.046875 -1.0429688,0.09375 -2.2148438,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.070312,-1.4414062 1.3359375,0 2.7187501,0.1289062 2.7187501,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164062,0.023437 -0.3984376,0.035156 -0.5976563,0.035156 h -2.625 c -0.3398438,0 -0.5039063,-0.035156 -0.5039063,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 6.5179922,0.03442969 c 0,0 0.1640625,-0.0351562525 0.515625,-0.0351562525 h 2.71875 c 0.2109375,0 0.4335938,0.0117187525 0.5976558,0.0351562525 l 0.02344,-0.0351562525 C 10.361742,-0.14135156 10.350023,-0.21166406 10.350023,-0.30541406 c 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.4765621,0.1171875 -2.8124996,0.1171875 -0.070312,-0.234375 -0.070312,-1.26562504 -0.070312,-1.42968754 z m 5.7246091,-2.4375 v 2.4375 c 0,0.75 -0.02344,1.45312504 -0.105468,1.9921875375 l 0.02344,0.0351562525 c 0.117188,-0.0234375 0.386719,-0.0351562525 0.515625,-0.0351562525 0.117188,0 0.386719,0.0117187525 0.503906,0.0351562525 l 0.02344,-0.0351562525 C 14.070727,-0.56322656 14.047289,-1.2429141 14.047289,-1.9929141 v -2.4375 c 0,-0.1757812 0.01172,-1.1953125 0.08203,-1.4414062 1.335938,0 2.015625,0.140625 2.015625,0.140625 l 0.01172,-0.035156 c -0.01172,-0.1523437 -0.01172,-0.375 0,-0.65625 l -0.01172,-0.035156 c -0.164062,0.023437 -0.398437,0.035156 -0.597656,0.035156 H 11.69182 c -0.199218,0 -0.433593,-0.011719 -0.597656,-0.035156 l -0.01172,0.035156 c 0.01172,0.28125 0.01172,0.5039063 0,0.65625 l 0.01172,0.035156 c 0,0 0.679688,-0.140625 2.015625,-0.140625 0.07031,0.2460937 0.08203,1.265625 0.08203,1.4414062 z m 5.53125,4.08984379 c -0.5625,0 -1.277343,-0.4921875 -1.582031,-1.07812499 l -0.105469,0.011719 c -0.03516,0.3632813 -0.152343,0.72656254 -0.222656,1.03125004 l 0.01172,0.0234375 c 0,0 0.644531,0.45703125 1.816406,0.45703125 1.21875,0 2.167969,-0.75 2.167969,-1.91015629 0,-1.1484375 -0.972656,-1.6875 -1.78125,-1.9921875 -0.503906,-0.1875 -1.21875,-0.4804687 -1.21875,-1.3007812 0,-0.3632813 0.199219,-0.7617188 0.457031,-0.9023438 0.175781,-0.082031 0.386719,-0.1171875 0.609375,-0.1171875 0.550781,0 1.089844,0.4335938 1.359375,1.0429688 l 0.09375,-0.011719 c 0.04687,-0.3515625 0.140625,-0.6914062 0.234375,-0.9960937 l -0.02344,-0.023437 c 0,0 -0.410157,-0.4453125 -1.582032,-0.4453125 -0.269531,0 -0.5625,0.046875 -0.855468,0.1640625 -0.585938,0.2460937 -1.089844,0.8320312 -1.089844,1.5585937 0,1.0429688 0.878906,1.5351563 1.710937,1.875 0.644532,0.2695313 1.160157,0.5976563 1.160157,1.453125 0,0.73828129 -0.574219,1.16015629 -1.160157,1.16015629 z m 5.124024,-3.63281249 c 0.375,0 0.726562,0.28125 0.996093,0.7382812 l 0.128907,-0.011719 0.222656,-0.8203125 -0.02344,-0.023437 c -0.304687,-0.1640625 -0.820312,-0.28125 -1.324218,-0.28125 -1.03125,0 -2.121094,0.84375 -2.121094,2.203125 0,1.38281249 0.785156,2.27343749 2.003906,2.27343749 0.597656,0 1.066406,-0.2109375 1.464844,-0.71484375 l -0.1875,-0.2109375 h -0.03516 c -0.386719,0.33984375 -0.703125,0.421875 -1.066407,0.421875 -0.726562,0 -1.300781,-0.65625004 -1.300781,-1.82812504 0,-1.0898437 0.574219,-1.7460937 1.242188,-1.7460937 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="PETSc"
+       transform="matrix(1.3333333,0,0,1.3333333,17.532,118.208)" />
+    <path
+       id="path9"
+       d="M 45.35506,79.37073 89.19551,35.53065"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path10"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.9428,0.94278667,0.94278667,-0.9428,123.306,172.39647)" />
+    <g
+       id="g14"
+       clip-path="url(#clipPath15)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,189.53756)"
+         style="fill:url(#linearGradient14);stroke:none"
+         id="path14" />
+    </g>
+    <path
+       id="path16"
+       d="M 0,34.01608 V 56.6936 H 45.35506 V 34.01608 Z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path17"
+       d="m 1.8995078,-2.8475469 c 0.3398438,0 0.6445313,-0.011719 0.9140625,-0.035156 0.28125,0.328125 0.4921875,0.7265625 0.75,1.125 0.3632813,0.5859375 0.84375,1.45312497 0.9492188,1.79296872 0.1875,-0.035156245 0.4101562,-0.035156245 0.5976562,-0.035156245 0.1992188,0 0.3984375,0 0.5976563,0.035156245 L 5.7315391,1.09375e-4 C 5.2276328,-0.56239063 4.2198203,-2.1327031 3.5987266,-2.9764531 c 0.2695312,-0.046875 0.46875,-0.1171875 0.609375,-0.1875 0.5390625,-0.2695313 0.9960937,-0.7851563 0.9960937,-1.6054688 0,-0.5039062 -0.1757812,-0.9140625 -0.5273437,-1.2421875 -0.4804688,-0.4570312 -1.2421875,-0.4804687 -1.8632813,-0.4804687 -0.2578125,0 -1.0546875,0.070312 -1.3476562,0.070312 -0.1171875,0 -0.3867188,-0.011719 -0.50390629,-0.035156 l -0.0234375,0.035156 c 0.0820313,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312497 -0.09375,1.992187475 L 0.96200781,0.03526562 C 1.0791953,0.01182812 1.3487266,1.09375e-4 1.4659141,1.09375e-4 c 0.1171875,0 0.3867187,0.011718745 0.5039062,0.035156245 L 1.9932578,1.09375e-4 C 1.9112266,-0.56239063 1.8995078,-1.2420781 1.8995078,-1.9920781 Z m 0.9257813,-3.2226562 c 0.5976562,0 1.4179687,0.234375 1.4179687,1.3125 0,1.2070312 -0.8085937,1.5117187 -1.7695312,1.5117187 H 1.8995078 v -2.3789062 c 0,-0.3867188 0.023437,-0.4453125 0.9257813,-0.4453125 z m 6.2460937,3.8320312 -0.046875,1.1835938 c 0,0.12890622 -0.058594,0.19921872 -0.140625,0.25781247 -0.28125,0.2109375 -0.609375,0.38671875 -0.9140625,0.38671875 -0.4453125,0 -0.7382812,-0.29296875 -0.7382812,-0.60937502 0,-0.4453125 0.2109375,-0.7851562 0.984375,-0.9960937 z m 0,1.75781252 c 0.1054688,0.38671875 0.3984375,0.5859375 0.7734375,0.5859375 0.2460937,0 0.5507817,-0.0703125 0.7851567,-0.31640625 l -0.07031,-0.234375 c -0.105469,0.0234375 -0.199219,0.046875 -0.269531,0.046875 -0.09375,0 -0.222656,-0.0234375 -0.2929689,-0.0820313 -0.1054688,-0.0820313 -0.1640625,-0.328125 -0.1640625,-0.77343757 0,-0.2929687 0.035156,-1.3945312 0.035156,-1.5 0,-1.3476562 -0.890625,-1.6171875 -1.6523437,-1.6171875 -0.75,0 -1.2070313,0.3632813 -1.4648438,0.5742188 l -0.023437,0.046875 0.1640625,0.65625 0.1289063,0.011719 c 0.28125,-0.4453125 0.6210937,-0.84375 1.1015625,-0.84375 0.3515625,0 0.9492187,0.046875 0.9492187,1.171875 0,0.070312 -0.035156,0.1171875 -0.070312,0.1289063 l -0.9257812,0.1992187 c -1.0078125,0.234375 -1.6523438,0.7734375 -1.6523438,1.45312507 0,0.73828125 0.515625,1.078125 1.2539063,1.078125 0.5390625,0 0.8203125,-0.140625 1.3476562,-0.5859375 z M 11.71982,-4.2772344 c -0.210937,0 -0.421875,0 -0.539062,-0.011719 -0.07031,0.1640625 -0.128906,0.2578125 -0.222656,0.3984375 l 0.04687,0.070312 c 0.175781,0 0.480468,0 0.714843,-0.011719 v 1.359375 c 0,0.5976563 -0.03516,1.3125 -0.03516,1.61718752 0,0.64453125 0.421875,0.9609375 0.878906,0.9609375 0.421875,0 0.738282,-0.105468745 1.160157,-0.375 l -0.128907,-0.234375 c -0.304687,0.09375 -0.527343,0.10546875 -0.785156,0.0703125 -0.234375,-0.0351563 -0.328125,-0.26953125 -0.328125,-0.82031252 0,-0.3046875 0.03516,-0.7265625 0.03516,-1.3242187 v -1.2539063 h 0.375 c 0.234375,0 0.574219,0.011719 0.714844,0.011719 0.03516,-0.1523438 0.05859,-0.2578125 0.117188,-0.3984375 l -0.04687,-0.070312 c -0.175782,0 -0.46875,0.011719 -0.691407,0.011719 h -0.46875 c 0,-0.7382812 0,-0.8671875 0.04687,-1.453125 0,-0.070312 -0.03516,-0.09375 -0.09375,-0.09375 -0.246093,0.09375 -0.375,0.2109375 -0.691406,0.2460938 l -0.02344,0.035156 c -0.02344,0.3398438 -0.03516,0.6914063 -0.03516,1.265625 z m 3.679688,1.5585938 c 0.175781,-1.03125 0.820312,-1.2539063 1.136719,-1.2539063 0.386718,0 0.84375,0.3632813 0.84375,1.1132813 0,0.09375 -0.04687,0.140625 -0.140625,0.140625 z m 2.660156,1.6875 c -0.351562,0.37499997 -0.808594,0.53906247 -1.359375,0.53906247 -0.351562,0 -0.820312,-0.12890625 -1.089844,-0.56249997 -0.175781,-0.28125 -0.234375,-0.6796875 -0.234375,-1.265625 h 2.707032 c 0.105468,0 0.175781,-0.058594 0.175781,-0.1757813 0,-0.8320312 -0.410156,-1.875 -1.722656,-1.875 -1.019532,0 -2.039063,0.8203125 -2.039063,2.2851563 0,0.5742187 0.105469,1.12499997 0.445313,1.52343747 0.339843,0.421875 0.914062,0.66796875 1.582031,0.66796875 0.726562,0 1.359375,-0.375 1.734375,-0.890625 z m 1.511719,-0.9609375 c 0,0.75 -0.02344,1.46484372 -0.09375,1.992187475 l 0.02344,0.035156245 c 0.117188,-0.0234375 0.339844,-0.035156245 0.46875,-0.035156245 0.117188,0 0.339844,0.011718745 0.457032,0.035156245 L 20.450289,1.09375e-4 C 20.368258,-0.56239063 20.368258,-1.2303594 20.368258,-1.9920781 v -2.9882813 c 0,-0.75 0.03516,-1.2304687 0.08203,-1.875 0,-0.070312 -0.02344,-0.09375 -0.08203,-0.09375 -0.257813,0.09375 -0.46875,0.1640625 -0.867188,0.1992188 l -0.02344,0.023437 c 0.07031,0.4335937 0.09375,1.171875 0.09375,1.921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Ratel"
+       transform="matrix(1.3333333,0,0,1.3333333,20.494667,163.88267)" />
+    <path
+       id="path18"
+       d="M 45.35506,45.35464 88.68451,30.46008"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path19"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.2609067,0.43342667,0.43342667,-1.2609067,122.62468,179.1572)" />
+    <g
+       id="g23"
+       clip-path="url(#clipPath24)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,234.89232)"
+         style="fill:url(#linearGradient23);stroke:none"
+         id="path23" />
+    </g>
+    <path
+       id="path25"
+       d="M 0,0 V 22.67752 H 45.35506 V 0 Z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path26"
+       d="m 5.3445234,-4.4292109 v 0.796875 H 1.8992109 v -0.796875 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867188,-0.011719 -0.50390626,-0.035156 l -0.0234375,0.035156 c 0.0820313,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312496 -0.09375,1.9921874625 L 0.94999219,0.03563281 C 1.0671797,0.01219531 1.3484297,4.765625e-4 1.4656172,4.765625e-4 c 0.1171875,0 0.3867187,0.0117187475 0.5039062,0.0351562475 L 1.9929609,4.765625e-4 C 1.9109297,-0.56202344 1.8992109,-1.2417109 1.8992109,-1.9917109 v -1.2070313 h 3.4453125 v 1.2070313 c 0,0.75 -0.023437,1.45312496 -0.1054687,1.9921874625 L 5.2507734,0.03563281 C 5.3679609,0.01219531 5.6492109,4.765625e-4 5.7663984,4.765625e-4 c 0.1171875,0 0.3867188,0.0117187475 0.515625,0.0351562475 L 6.2937422,4.765625e-4 C 6.2117109,-0.56202344 6.1999922,-1.2417109 6.1999922,-1.9917109 v -2.4375 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.1054687,1.2421875 0.1054684,1.9921875 z m 5.4287106,-1.6875 c 1.40625,0 2.296875,0.9609375 2.296875,3.0234375 0,1.7929687 -0.820312,2.75390621 -1.957031,2.75390621 -1.1953124,0 -2.3320311,-1.00781251 -2.3320311,-2.91796871 0,-2.0859375 1.03125,-2.859375 1.9921871,-2.859375 z m 3.328125,2.8359375 c 0,-1.96875 -1.40625,-3.2695313 -3.199218,-3.2695313 -1.7812504,0 -3.1523441,1.5117188 -3.1523441,3.4570313 0,1.96875 1.4414062,3.19921871 3.1640621,3.19921871 1.898438,0 3.1875,-1.39453121 3.1875,-3.38671871 z m 1.590821,-3.140625 c 0.08203,1.8867187 0.05859,4.8398437 -0.105469,6.4218749625 l 0.02344,0.0351562475 c 0.117188,-0.0234375 0.234375,-0.0351562475 0.351563,-0.0351562475 0.117187,0 0.222656,0.0117187475 0.351562,0.0351562475 L 16.324992,4.765625e-4 C 16.242961,-0.56202344 16.231242,-1.2417109 16.231242,-1.9917109 v -2.3789063 c 0,-0.609375 0.01172,-0.6445312 0.386719,-0.1640625 l 3.480469,4.39453126 c 0.105468,0.15234375 0.257812,0.24609375 0.421875,0.24609375 0.140625,0 0.175781,-0.12890625 0.1875,-0.31640625 0.04687,-2.27343746 0.04687,-4.00781246 0.199218,-6.21093746 l -0.01172,-0.035156 c -0.128907,0.023437 -0.234375,0.035156 -0.351563,0.035156 -0.117187,0 -0.234375,-0.011719 -0.351562,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.105469,1.2421875 0.105469,1.9921875 v 2.6601562 c -0.02344,0.5390625 -0.164063,0.3164063 -0.585938,-0.2578125 l -3.421875,-4.4179687 c 0,0 -0.09375,0.023437 -0.140625,0.023437 -0.339843,0 -0.410156,-0.035156 -0.410156,-0.035156 z m 7.974609,4.4296875 v -1.1601563 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226563 -0.03516,-0.3164063 0,-0.082031 0.01172,-0.2226562 0.03516,-0.3164062 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757813 0,-1.1953125 0.07031,-1.4414063 1.335937,0 2.71875,0.1289063 2.71875,0.1289063 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164063,0.023437 -0.398438,0.035156 -0.597657,0.035156 h -2.625 c -0.339843,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.9921874625 L 22.71757,0.03563281 c 0,0 0.164063,-0.0351562475 0.515625,-0.0351562475 h 2.71875 c 0.210938,0 0.433594,0.0117187475 0.597657,0.0351562475 L 26.573039,4.765625e-4 C 26.56132,-0.14014844 26.549602,-0.21046094 26.549602,-0.30421094 c 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.26562496 -0.07031,-1.42968746 z m 5.358398,0 v -1.1601563 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226563 -0.03516,-0.3164063 0,-0.082031 0.01172,-0.2226562 0.03516,-0.3164062 l -0.03516,-0.023437 c -0.585937,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757813 0,-1.1953125 0.07031,-1.4414063 1.335937,0 2.71875,0.1289063 2.71875,0.1289063 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164063,0.023437 -0.398438,0.035156 -0.597656,0.035156 h -2.625 c -0.339844,0 -0.503907,-0.035156 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.9921874625 l 0.01172,0.0351562475 c 0,0 0.164062,-0.0351562475 0.515625,-0.0351562475 h 2.71875 c 0.210937,0 0.433593,0.0117187475 0.597656,0.0351562475 L 31.931437,4.765625e-4 C 31.919719,-0.14014844 31.908,-0.21046094 31.908,-0.30421094 c 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.26562496 -0.07031,-1.42968746 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="HONEE"
+       transform="matrix(1.3333333,0,0,1.3333333,12.956,208.91733)" />
+    <path
+       id="path27"
+       d="M 45.35506,11.33856 88.68451,26.23311"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path28"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.2609067,-0.43342667,-0.43342667,-1.2609067,122.62468,184.7932)" />
+    <g
+       id="g32"
+       clip-path="url(#clipPath33)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,280.24763)"
+         style="fill:url(#linearGradient32);stroke:none"
+         id="path32" />
+    </g>
+    <path
+       id="path34"
+       d="m 0,-34.01648 v 22.67752 h 45.35506 v -22.67752 z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path35"
+       d="m 1.9222109,-6.59725 c -0.3515625,2.015625 -0.890625,4.7695312 -1.28906246,6.6328125 0.1171875,-0.03515625 0.2109375,-0.03515625 0.328125,-0.03515625 0.11718746,0 0.19921876,0 0.30468746,0.03515625 0.1523438,-1.0546875 0.4921875,-3.2695313 0.7265625,-4.78125 h 0.046875 c 0.75,1.59375 1.453125,3.1992187 2.0976563,4.74609375 h 0.1875 C 5.0393984,-1.6285 5.7425234,-3.1519375 6.5393984,-4.7105313 l 0.023437,0.011719 c 0.2109375,1.5585937 0.4101563,3.0820312 0.5625,4.734375 C 7.2776797,4.0625e-4 7.4651797,4.0625e-4 7.6058047,4.0625e-4 c 0.1523437,0 0.4101562,0 0.5625,0.03515625 C 7.8050234,-2.09725 7.5120547,-4.1714688 7.2308047,-6.59725 H 6.9847109 L 4.5589297,-1.6519375 H 4.4886172 C 3.6917422,-3.2925625 2.9768984,-4.8863125 2.2503359,-6.59725 Z m 10.9599611,0.1757812 h -2.589844 c -0.3398436,0 -0.5039061,-0.035156 -0.5039061,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312505 -0.09375,1.99218755 l 0.011719,0.03515625 c 0,0 0.1640625,-0.03515625 0.5156251,-0.03515625 0.339844,0 0.503906,0.03515625 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218755 -0.09375,-1.99218755 v -1.1601562 c 0.46875,0 1.523437,0.023437 2.214844,0.09375 l 0.03516,-0.035156 C 12.952484,-3.175375 12.940766,-3.316 12.940766,-3.40975 c 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335938,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.2109375 -0.01172,-0.2929687 0,-0.082031 0,-0.1992188 0.01172,-0.3398438 l -0.01172,-0.035156 c -0.164062,0.023437 -0.398437,0.035156 -0.632812,0.035156 z m 2.865234,4.4296875 v -1.1601562 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 C 17.973969,-3.175375 17.96225,-3.316 17.96225,-3.40975 c 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335937,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164063,0.023437 -0.398438,0.035156 -0.597657,0.035156 h -2.625 c -0.339843,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312505 -0.09375,1.99218755 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 h 2.71875 c 0.210938,0 0.433594,0.01171875 0.597657,0.03515625 l 0.02344,-0.03515625 c -0.01172,-0.140625 -0.02344,-0.2109375 -0.02344,-0.3046875 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.26562505 -0.07031,-1.42968755 z M 21.129242,-6.59725 C 20.77768,-4.581625 20.238617,-1.8277188 19.84018,0.0355625 19.957367,4.0625e-4 20.051117,4.0625e-4 20.168305,4.0625e-4 c 0.117187,0 0.199218,0 0.304687,0.03515625 0.152344,-1.0546875 0.492188,-3.2695313 0.726563,-4.78125 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097656,4.74609375 h 0.1875 C 24.24643,-1.6285 24.949555,-3.1519375 25.74643,-4.7105313 l 0.02344,0.011719 c 0.210938,1.5585937 0.410156,3.0820312 0.5625,4.734375 0.152344,-0.03515625 0.339844,-0.03515625 0.480469,-0.03515625 0.152344,0 0.410156,0 0.5625,0.03515625 -0.363281,-2.1328125 -0.65625,-4.2070313 -0.9375,-6.6328125 h -0.246094 l -2.425781,4.9453125 h -0.07031 C 22.898773,-3.2925625 22.18393,-4.8863125 21.457367,-6.59725 Z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="MFEM"
+       transform="matrix(1.3333333,0,0,1.3333333,15.925333,254.42133)" />
+    <path
+       id="path36"
+       d="M 45.35506,-22.67793 89.19551,21.16212"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path37"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.9428,-0.94278667,-0.94278667,-0.9428,123.306,191.55451)" />
+    <path
+       id="path38"
+       d="m 2.2861484,-5.3202109 c 0,-0.890625 0.023437,-1.7578125 0.1171875,-2.390625 l -0.011719,-0.035156 c -0.140625,0.023437 -0.4804688,0.035156 -0.6210938,0.035156 -0.140625,0 -0.46875,-0.011719 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171875,2.390625 v 2.9296875 c 0,0.890625 -0.023437,1.74609371 -0.1171875,2.3906249625 L 1.1494297,0.03525781 c 0,0 0.2109375,-0.0351562475 0.6210937,-0.0351562475 h 3.2578125 c 0.2578125,0 0.5273438,0.0117187475 0.7265625,0.0351562475 L 5.7783359,1.015625e-4 C 5.7666172,-0.16396094 5.7548984,-0.41005469 5.7548984,-0.52724219 c 0,-0.1171875 0.011719,-0.31640625 0.023437,-0.421875 l -0.023437,-0.046875 c 0,0 -1.7695312,0.26953125 -3.375,0.26953125 -0.082031,-0.29296876 -0.09375,-1.46484376 -0.09375,-1.66406246 z m 4.8310547,-1.8164063 c 0,0.328125 0.28125,0.609375 0.609375,0.609375 0.3164063,0 0.609375,-0.28125 0.609375,-0.609375 0,-0.3164062 -0.2929687,-0.609375 -0.609375,-0.609375 -0.328125,0 -0.609375,0.2929688 -0.609375,0.609375 z m 0.1171875,4.3242188 v 0.65625 c 0,0.9023437 -0.023437,1.51171871 -0.1054687,2.1562499625 L 7.1523594,0.03525781 c 0.140625,-0.0234375 0.4101562,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.4101562,0.0117187475 0.5625,0.0351562475 L 8.3007969,1.015625e-4 C 8.1953281,-0.66786719 8.1836094,-1.2420859 8.1836094,-2.1561484 v -0.84375 c 0,-0.9023438 0.023437,-1.3242188 0.1171875,-2.0507813 0,-0.1171875 -0.023437,-0.1289062 -0.1171875,-0.1289062 -0.2695313,0.035156 -0.7851563,0.035156 -1.03125,0.011719 l -0.023437,0.035156 c 0.070312,0.515625 0.1054687,1.4296875 0.1054682,2.3203125 z m 3.9638674,1.83984371 V -4.0663047 c 0.46875,-0.4804687 0.773437,-0.703125 1.253906,-0.703125 0.738281,0 1.382813,0.609375 1.382813,2.0976563 0,1.4765625 -0.445313,2.30859371 -1.582032,2.30859371 -0.375,0 -0.808593,-0.3046875 -1.054687,-0.609375 z m 0,-5.00390621 c 0,-0.8789063 0.01172,-1.5117188 0.07031,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.105468,-0.1171875 -0.292969,0.1171875 -0.5625,0.1992187 -1.042969,0.234375 l -0.02344,0.035156 c 0.09375,0.515625 0.152344,1.40625 0.152344,2.3085938 v 4.89843746 c 0,0.45703125 -0.01172,0.64453125 -0.05859,0.890625 0.07031,0.0703125 0.199219,0.09375 0.328125,0.09375 0.128907,-0.140625 0.269532,-0.33984375 0.398438,-0.5859375 0.339844,0.3046875 0.890625,0.5859375 1.40625,0.5859375 1.21875,0 2.566406,-0.85546875 2.566406,-2.89453126 0,-1.453125 -1.03125,-2.4726562 -2.191406,-2.4726562 -0.574219,0 -1.089844,0.1523437 -1.5,0.5976562 z m 6.084961,1.9804687 v -1.0898437 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960937,0.035156 l -0.02344,0.035156 c 0.08203,0.5273437 0.105469,1.4296875 0.105469,2.3203125 v 0.65625 c 0,0.9023437 -0.02344,1.52343746 -0.105469,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.550781,0.0351562475 L 17.400406,1.015625e-4 C 17.294937,-0.67958594 17.283219,-1.2420859 17.283219,-2.1561484 v -0.6210938 c 0,-0.4921875 0.152343,-0.7851562 0.410156,-1.1835937 0.164062,-0.2578125 0.445312,-0.4101563 0.667969,-0.4101563 0.246093,0 0.46875,0.023437 0.621093,0.1640625 l 0.09375,-0.023437 0.246094,-0.890625 -0.04687,-0.046875 c -0.210937,-0.058594 -0.222656,-0.082031 -0.433594,-0.082031 -0.644531,0 -0.996093,0.375 -1.523437,1.2773437 z m 5.956054,1.3007813 -0.04687,1.4296875 c 0,0.1523437 -0.07031,0.234375 -0.164062,0.30468746 -0.339844,0.2578125 -0.738281,0.46875 -1.101563,0.46875 -0.539062,0 -0.878906,-0.3515625 -0.878906,-0.72656246 0,-0.5390625 0.246094,-0.9492188 1.183594,-1.1953125 z m 0,2.12109371 c 0.128907,0.46875 0.492188,0.69140625 0.9375,0.69140625 0.292969,0 0.667969,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117188,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93750006 0,-0.3398437 0.03516,-1.6640625 0.03516,-1.8046875 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914063,0 -1.464844,0.4453125 -1.757813,0.6914063 l -0.03516,0.046875 0.199219,0.8085937 0.152344,0.011719 c 0.339843,-0.5390624 0.738281,-1.0195312 1.3125,-1.0195312 0.433593,0 1.148437,0.058594 1.148437,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523437 l -1.113281,0.2460938 c -1.21875,0.28125 -1.980469,0.9257812 -1.980469,1.734375 0,0.89062501 0.609375,1.28906251 1.488281,1.28906251 0.667969,0 0.996094,-0.15234375 1.628906,-0.69140625 z m 4.03711,-3.42187501 v -1.0898437 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960937,0.035156 l -0.02344,0.035156 c 0.08203,0.5273437 0.105469,1.4296875 0.105469,2.3203125 v 0.65625 c 0,0.9023437 -0.02344,1.52343746 -0.105469,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.550781,0.0351562475 L 27.39357,1.015625e-4 C 27.288102,-0.67958594 27.276383,-1.2420859 27.276383,-2.1561484 v -0.6210938 c 0,-0.4921875 0.152344,-0.7851562 0.410156,-1.1835937 0.164063,-0.2578125 0.445313,-0.4101563 0.667969,-0.4101563 0.246094,0 0.46875,0.023437 0.621094,0.1640625 l 0.09375,-0.023437 0.246093,-0.890625 -0.04687,-0.046875 c -0.210937,-0.058594 -0.222656,-0.082031 -0.433593,-0.082031 -0.644532,0 -0.996094,0.375 -1.523438,1.2773437 z m 2.771484,-1.171875 c 0.457031,1.0664063 1.886719,4.1484375 2.308594,5.28515626 -0.375,0.90234374 -0.867188,1.81640624 -1.40625,2.71875004 0.117187,-0.035156 0.304687,-0.058594 0.421875,-0.058594 0.117187,0 0.46875,0.023437 0.585937,0.058594 0.457032,-1.5 3.070313,-7.1953125 3.46875,-8.0039063 -0.128906,0.035156 -0.398437,0.035156 -0.515625,0.035156 -0.117187,0 -0.351562,0 -0.46875,-0.035156 -0.433593,1.359375 -0.996093,2.8828125 -1.582031,4.125 h -0.02344 c -0.585938,-1.4179687 -1.148438,-2.7421875 -1.582032,-4.125 -0.164062,0.035156 -0.421875,0.035156 -0.585937,0.035156 -0.164063,0 -0.457031,0 -0.621094,-0.035156 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Library"
+       transform="matrix(1.3333333,0,0,1.3333333,131.72133,15.605333)" />
+    <g
+       id="g42"
+       clip-path="url(#clipPath43)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,95.08096,212.21524)"
+         style="fill:url(#linearGradient42);stroke:none"
+         id="path42" />
+    </g>
+    <path
+       id="path44"
+       d="m 90.7097,17.00783 v 22.67752 h 45.35506 V 17.00783 Z"
+       style="fill:none;stroke:#3c5a8a;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path45"
+       d="m 0.9021875,-1.9913125 c 0,0.75 -0.0234375,1.46484375 -0.09375,1.9921875 L 0.831875,0.03603125 C 0.9490625,0.01259375 1.1717188,8.75e-4 1.300625,8.75e-4 c 0.1171875,0 0.3398438,0.01171875 0.4570313,0.03515625 L 1.7810938,8.75e-4 c -0.082031,-0.5625 -0.082031,-1.2304687 -0.082031,-1.9921875 v -2.9882812 c 0,-0.75 0.035156,-1.2304688 0.082031,-1.875 0,-0.070312 -0.023437,-0.09375 -0.082031,-0.09375 -0.2578125,0.09375 -0.46875,0.1640625 -0.8671875,0.1992187 l -0.0234375,0.023437 c 0.0703125,0.4335938 0.09375,1.171875 0.09375,1.921875 z m 2.4052734,-3.9492187 c 0,0.2578125 0.234375,0.5039062 0.5039063,0.5039062 0.2695312,0 0.515625,-0.2460937 0.515625,-0.5039062 0,-0.2695313 -0.2460938,-0.515625 -0.515625,-0.515625 -0.2695313,0 -0.5039063,0.2460937 -0.5039063,0.515625 z m 0.1054688,3.5976562 v 0.5507813 c 0,0.75 -0.023437,1.2539062 -0.09375,1.7929687 l 0.023437,0.03515625 C 3.4598047,0.01259375 3.6824609,8.75e-4 3.8113672,8.75e-4 c 0.1171875,0 0.3398437,0.01171875 0.4570312,0.03515625 L 4.2918359,8.75e-4 c -0.082031,-0.5625 -0.082031,-1.03125 -0.082031,-1.7929687 v -0.703125 c 0,-0.75 0.011719,-1.1132813 0.082031,-1.7226563 0,-0.082031 -0.023437,-0.09375 -0.082031,-0.09375 -0.234375,0.023437 -0.6679688,0.023437 -0.8671875,0.011719 l -0.023437,0.023437 c 0.058594,0.4335937 0.09375,1.1835937 0.093749,1.933594 z m 3.3046875,1.53515625 V -3.3858437 c 0.3867187,-0.3984375 0.6328125,-0.5859375 1.0429687,-0.5859375 0.609375,0 1.1484375,0.5039062 1.1484375,1.7460937 0,1.23046875 -0.375,1.921875 -1.3125,1.921875 -0.3164062,0 -0.6796875,-0.2578125 -0.8789062,-0.50390625 z m 0,-4.17187495 c 0,-0.7382813 0.011719,-1.2539063 0.058594,-1.875 0,-0.070312 -0.035156,-0.09375 -0.09375,-0.09375 -0.2460937,0.09375 -0.46875,0.1640625 -0.8671875,0.1992187 l -0.011719,0.023437 c 0.058594,0.4335938 0.1171875,1.171875 0.1171875,1.921875 v 4.078125 c 0,0.38671875 -0.011719,0.5390625 -0.058594,0.75 0.070312,0.05859375 0.1757813,0.08203125 0.28125,0.08203125 0.1171875,-0.12890625 0.2226563,-0.29296875 0.328125,-0.4921875 0.2929688,0.24609375 0.75,0.4921875 1.1835938,0.4921875 1.0078125,0 2.1328125,-0.7265625 2.1328125,-2.41406245 0,-1.21875 -0.8671875,-2.0625 -1.828125,-2.0625 -0.4804688,0 -0.9023438,0.1289062 -1.2421875,0.4921875 z M 13.90707,-6.5499062 c -1.6875,0 -3.269531,1.453125 -3.269531,3.4101562 0,1.7226563 0.984375,3.24609375 3.128906,3.24609375 0.914063,0 1.78125,-0.28125 2.449219,-1.08984375 -0.01172,-0.1171875 -0.02344,-0.3398437 -0.05859,-0.4335937 l -0.07031,-0.023437 c -0.703125,0.77343745 -1.300781,1.03124995 -2.226563,1.03124995 -1.265625,0 -2.191406,-1.35937495 -2.191406,-2.91796875 0,-2.015625 1.277344,-2.7539062 2.097656,-2.7539062 0.902344,0 1.628907,0.3515625 2.039063,1.171875 L 15.922695,-4.921 c 0.02344,-0.5039062 0.07031,-0.6796875 0.175782,-1.1015625 l -0.02344,-0.035156 c 0,0 -0.949219,-0.4921875 -2.167969,-0.4921875 z m 4.857422,4.5585937 v -1.1601562 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226562 -0.03516,-0.3164062 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335937,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 C 21.389492,-6.4327187 21.155117,-6.421 20.955898,-6.421 h -2.625 c -0.339843,0 -0.503906,-0.035156 -0.503906,-0.035156 L 17.803555,-6.421 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.453125 -0.09375,1.9921875 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 h 2.71875 c 0.210938,0 0.433594,0.01171875 0.597657,0.03515625 L 21.670742,8.75e-4 c -0.01172,-0.140625 -0.02344,-0.2109375 -0.02344,-0.3046875 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.265625 -0.07031,-1.4296875 z m 5.361328,0 v -1.1601562 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226562 -0.03516,-0.3164062 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585937,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335937,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 C 26.75082,-6.4327187 26.516445,-6.421 26.317227,-6.421 h -2.625 c -0.339844,0 -0.503907,-0.035156 -0.503907,-0.035156 L 23.164883,-6.421 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.453125 -0.09375,1.9921875 l 0.01172,0.03515625 c 0,0 0.164062,-0.03515625 0.515625,-0.03515625 h 2.71875 c 0.210937,0 0.433593,0.01171875 0.597656,0.03515625 L 27.03207,8.75e-4 c -0.01172,-0.140625 -0.02344,-0.2109375 -0.02344,-0.3046875 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 C 24.12582,-0.796 24.12582,-1.82725 24.12582,-1.9913125 Z m 5.358399,1.171875 v -4.7929687 c 0,-0.3515625 0.339844,-0.3984375 0.832031,-0.3984375 2.15625,0 2.800781,1.6523437 2.800781,3.1875 0,2.0039062 -1.148437,2.4023437 -2.554687,2.4023437 -0.972656,0 -1.078125,-0.0820313 -1.078125,-0.3984375 z M 29.050625,-6.421 c -0.339844,0 -0.503906,-0.035156 -0.503906,-0.035156 L 28.523281,-6.421 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.453125 -0.09375,1.9921875 L 28.535,0.03603125 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257813,-0.64453125 3.257813,-3.09375 0,-1.8515625 -1.535157,-3.3867187 -3.445313,-3.3867187 -0.632812,0 -1.007812,0.035156 -1.640625,0.035156 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="libCEED"
+       transform="matrix(1.3333333,0,0,1.3333333,132.45333,186.56133)" />
+    <path
+       id="path46"
+       d="m 136.06476,28.3468 44.48515,100.09139"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path47"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.54150667,-1.2183867,-1.2183867,-0.54150667,245.11188,48.519773)" />
+    <path
+       id="path48"
+       d="m 136.06476,28.3468 44.16698,66.25003"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path49"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.73956,-1.10936,-1.10936,-0.73956,244.68763,93.64156)" />
+    <path
+       id="path50"
+       d="M 136.06476,28.3468 179.7063,61.07774"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path51"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.0666267,-0.79998667,-0.79998667,-1.0666267,243.98707,138.33368)" />
+    <path
+       id="path52"
+       d="m 136.06476,28.3468 h 43.21309"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path53"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3333333,0,0,-1.3333333,243.4158,181.97493)" />
+    <path
+       id="path54"
+       d="M 136.06476,28.3468 179.7063,-4.38412"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path55"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.0666267,0.79998667,0.79998667,-1.0666267,243.98707,225.61616)" />
+    <path
+       id="path56"
+       d="m 136.06476,28.3468 44.16698,-66.25001"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path57"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.73956,1.10936,1.10936,-0.73956,244.68763,270.30828)" />
+    <path
+       id="path58"
+       d="M 136.06476,28.3468 180.54991,-71.74457"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path59"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.54150667,1.2183867,1.2183867,-0.54150667,245.11188,315.43007)" />
+    <path
+       id="path60"
+       d="m 2.2725703,-6.6798594 c 0,-0.3984375 0.1171875,-0.5507812 1.0898438,-0.5507812 0.6679687,0 1.5117187,0.2578125 1.5117187,1.4765625 0,1.0195312 -0.6796875,1.3242187 -1.6757812,1.3242187 H 2.2725703 Z m 0,2.7539063 h 1.1132813 c 1.3945312,0 2.0507812,0.8789062 2.0507812,1.9570312 0,0.8203125 -0.28125,1.48828127 -2.0039062,1.48828127 -0.84375,0 -1.1601563,-0.17578125 -1.1601563,-0.57421877 z m -0.515625,-3.7851563 c -0.4101562,0 -0.609375,-0.035156 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171875,2.390625 v 2.9296875 c 0,0.890625 -0.023437,1.74609377 -0.1171875,2.390625025 l 0.011719,0.035156245 c 0,0 0.1992187,-0.035156245 0.6210937,-0.035156245 0.7382813,0 0.8320313,0.035156245 1.8984375,0.035156245 2.3789063,0 2.9648438,-1.20703127 2.9648438,-2.23828127 0,-1.1484375 -0.7734375,-1.8046875 -1.8164063,-2.0976562 0.609375,-0.3046875 1.1132813,-0.9375 1.1132813,-1.59375 0,-0.796875 -0.421875,-1.8515625 -2.6484375,-1.8515625 -0.4101563,0 -0.9140625,0.035156 -1.5117195,0.035156 z m 9.3105467,5.015625 -0.04687,1.4296875 c 0,0.1523438 -0.07031,0.234375 -0.164062,0.30468752 -0.339844,0.2578125 -0.738282,0.46875 -1.1015628,0.46875 -0.5390625,0 -0.8789063,-0.3515625 -0.8789063,-0.72656252 0,-0.5390625 0.2460938,-0.9492187 1.1835941,-1.1953125 z m 0,2.12109377 c 0.128906,0.46875 0.492188,0.69140625 0.9375,0.69140625 0.292969,0 0.667969,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128907,0 -0.28125,-0.0234375 -0.363282,-0.09375 -0.117187,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749992 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066407,-1.9335937 -1.96875,-1.9335937 -0.9140628,0 -1.4648441,0.4453125 -1.7578128,0.6914062 l -0.035156,0.046875 0.1992188,0.8085937 0.1523437,0.011719 c 0.3398438,-0.5390625 0.7382813,-1.0195313 1.3125,-1.0195313 0.4335933,0 1.1484373,0.058594 1.1484373,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523438 l -1.1132813,0.2460937 c -1.21875,0.28125 -1.9804687,0.9257813 -1.9804687,1.734375 0,0.89062507 0.609375,1.28906257 1.4882812,1.28906257 0.6679688,0 0.9960938,-0.15234375 1.6289058,-0.69140625 z m 5.036133,-4.19531247 c 0.457031,0 0.867188,0.3515625 1.195313,0.8789062 l 0.152343,-0.011719 0.28125,-0.9726563 -0.02344,-0.035156 c -0.375,-0.1992188 -0.996094,-0.3398438 -1.59375,-0.3398438 -1.242188,0 -2.542969,1.0195313 -2.542969,2.6484375 0,1.65234377 0.9375,2.71875002 2.402344,2.71875002 0.714844,0 1.277344,-0.234375 1.757812,-0.84375 l -0.234375,-0.2578125 h -0.04687 c -0.445312,0.41015625 -0.832031,0.50390625 -1.277343,0.50390625 -0.867188,0 -1.546875,-0.78515627 -1.546875,-2.17968747 0,-1.3242188 0.691406,-2.109375 1.476562,-2.109375 z m 3.216797,2.3789062 c 0,0.890625 -0.02344,1.75781252 -0.105469,2.390625025 l 0.02344,0.035156245 c 0.140625,-0.0234375 0.410156,-0.035156245 0.5625,-0.035156245 0.140625,0 0.410156,0.011718745 0.5625,0.035156245 l 0.02344,-0.035156245 C 20.281359,-0.67985938 20.269641,-1.4767344 20.269641,-2.3907969 v -0.1171875 c 0.105468,0.011719 0.386718,0.046875 0.492187,0.1289063 0.785156,0.8085937 1.066406,1.2421875 1.875,2.41406247 0.1875,-0.01171875 0.585938,-0.035156245 0.796875,-0.035156245 0.1875,0 0.609375,0.023437495 0.714844,0.035156245 l 0.02344,-0.035156245 C 23.269641,-0.93767188 22.636828,-1.4532969 21.500109,-2.8478281 c 0.46875,-0.4921875 1.640625,-1.6171875 2.425782,-2.2851563 l -0.03516,-0.035156 c -0.222656,0.035156 -0.878906,0.035156 -1.207031,0.035156 -0.515625,0.7617188 -1.417969,1.7226563 -1.886719,2.0859375 -0.152343,0.1171875 -0.363281,0.1640625 -0.527343,0.1757813 v -3.1054688 c 0,-0.9023437 0.04687,-1.4765625 0.117187,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.117187,-0.1171875 -0.292969,0.1171875 -0.5625,0.1992188 -1.03125,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105469,1.40625 0.105469,2.3085937 z m 6.436523,-0.8671875 c 0.199219,-1.2539062 0.972657,-1.5117187 1.359375,-1.5117187 0.46875,0 0.996094,0.4453125 0.996094,1.3359375 0,0.1054687 -0.04687,0.1757812 -0.164062,0.1757812 z m 3.1875,2.015625 c -0.421875,0.45703127 -0.984375,0.65625002 -1.640625,0.65625002 -0.421875,0 -0.984375,-0.15234375 -1.300781,-0.67968752 -0.210937,-0.3398437 -0.292969,-0.8085937 -0.292969,-1.5234375 h 3.257813 c 0.128906,0 0.210937,-0.070312 0.210937,-0.1992187 0,-1.0078125 -0.492187,-2.2617188 -2.0625,-2.2617188 -1.230468,0 -2.449218,0.9960938 -2.449218,2.7539063 0,0.6796875 0.128906,1.3476562 0.539062,1.82812497 0.398438,0.50390625 1.078125,0.78515625 1.898438,0.78515625 0.855468,0 1.617187,-0.4453125 2.074218,-1.06640625 z m 6.225586,-0.9140625 c 0,-0.375 0.02344,-0.8203125 0.02344,-1.2070312 0,-1.2421875 -0.398438,-1.8867188 -1.476563,-1.8867188 -0.457031,0 -1.300781,0.1875 -2.074218,1.078125 l -0.02344,-0.035156 v -0.8789063 c -0.01172,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269532,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.03516,0.035156 c 0.09375,0.5273438 0.117187,1.4296875 0.117187,2.3203125 v 0.65625 c 0,0.9023438 -0.01172,1.51171877 -0.117187,2.156250025 L 30.588,0.03498437 c 0.140625,-0.0234375 0.410156,-0.035156245 0.550781,-0.035156245 0.152344,0 0.421875,0.011718745 0.5625,0.035156245 l 0.02344,-0.035156245 C 31.630969,-0.67985938 31.61925,-1.2423594 31.61925,-2.1564219 v -1.3945312 c 0.609375,-0.7148438 1.300781,-0.984375 1.734375,-0.984375 0.609375,0 0.867188,0.2460937 0.867188,1.2070312 v 1.171875 c 0,0.9023438 -0.02344,1.52343752 -0.117188,2.156250025 l 0.02344,0.035156245 c 0.152343,-0.0234375 0.421875,-0.035156245 0.5625,-0.035156245 0.152343,0 0.421875,0.011718745 0.5625,0.035156245 L 35.2755,-1.71875e-4 C 35.18175,-0.67985938 35.170031,-1.2423594 35.170031,-2.1564219 Z m 5.258789,0.8085938 c -0.445312,0.56249997 -0.984375,0.84374997 -1.464843,0.84374997 -0.644532,0 -1.195313,-0.64453127 -1.195313,-2.13281247 0,-1.7929688 0.9375,-2.1328125 1.523438,-2.1328125 0.5625,0 0.855468,0.234375 1.136718,0.6914062 z m 0,0.65624997 h 0.02344 l 0.05859,0.691406255 c 0,0.023437495 0.03516,0.035156245 0.117187,0.035156245 0.152344,-0.01171875 0.234375,-0.035156245 0.398438,-0.035156245 0.152343,0 0.386718,0.011718745 0.539062,0.035156245 l 0.02344,-0.035156245 C 41.495227,-0.51579688 41.389758,-1.3947031 41.389758,-2.2970469 v -3.6796875 c 0,-0.8789062 0.04687,-1.5117187 0.105469,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.105469,-0.1171875 -0.304688,0.1171875 -0.5625,0.1992188 -1.042969,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105468,1.40625 0.105468,2.3085937 v 0.7851563 c -0.269531,-0.1640625 -0.761718,-0.2695313 -0.996093,-0.2695313 -1.570313,0 -2.707032,1.078125 -2.707032,2.7070313 0,1.4648437 0.867188,2.66015622 2.167969,2.66015622 0.574219,0 1.113281,-0.24609375 1.535156,-0.80859375 z m 2.622071,-0.51562497 -0.199219,1.00781247 c 0.679687,0.234375 1.347656,0.31640625 1.804687,0.31640625 1.640625,0 2.027344,-0.97265625 2.027344,-1.64062502 0,-0.9726562 -0.773437,-1.359375 -1.628906,-1.5585937 -0.445313,-0.1054688 -1.101563,-0.3398438 -1.101563,-0.9375 0,-0.4921875 0.386719,-0.75 0.902344,-0.75 0.621094,0 1.007813,0.4921875 1.277344,0.8085937 l 0.164062,-0.011719 0.246094,-0.8789063 -0.02344,-0.035156 C 46.168075,-5.062672 45.511825,-5.250172 44.90245,-5.250172 c -0.878906,0 -1.804687,0.46875 -1.804687,1.4648438 0,0.9492187 0.703125,1.2773437 1.429687,1.4648437 0.703125,0.1875 1.21875,0.3867188 1.21875,0.9726563 0,0.63281247 -0.46875,0.98437497 -1.101562,0.98437497 -0.574219,0 -1.078125,-0.38671875 -1.417969,-0.85546877 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Backends"
+       transform="matrix(1.3333333,0,0,1.3333333,250.716,17.398667)" />
+    <g
+       id="g64"
+       clip-path="url(#clipPath65)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,76.15032)"
+         style="fill:url(#linearGradient64);stroke:none"
+         id="path64" />
+    </g>
+    <path
+       id="path66"
+       d="m 181.4194,119.0565 v 22.67752 h 56.69362 V 119.0565 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path67"
+       d="m 1.8978125,-5.6261641 c 0,-0.2929687 0.1757812,-0.4453125 0.890625,-0.4453125 0.6914062,0 1.4179687,0.1875 1.4179687,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.5234375,1.5117188 -0.2578125,0 -0.65625,-0.035156 -0.7851562,-0.1054688 z M 1.030625,-4.4308516 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.99218754 L 0.9603125,0.03399219 C 1.0775,0.01055469 1.3470312,-0.00116406 1.4642187,-0.00116406 c 0.1171875,0 0.3867188,0.01171875 0.5039063,0.03515625 l 0.023437,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -0.8554687 c 0.2109375,0.070312 0.4804687,0.09375 0.8320312,0.09375 1.8164063,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.3085937,-1.7460938 -0.2578125,0 -1.1015625,0.070312 -1.3945313,0.070312 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.0937505,1.992188 z m 8.1708984,3.51562504 0.046875,0.0234375 0.046875,0.890625 c 0,0.0234375 0.011719,0.03515625 0.035156,0.03515625 0.1289062,-0.01171875 0.2578125,-0.03515625 0.3867187,-0.03515625 0.140625,0 0.3281249,0.01171875 0.4570319,0.03515625 L 10.1859,-0.00116406 C 10.115586,-0.43475781 10.021836,-1.1613203 10.021836,-1.9113203 v -0.5742188 c 0,-0.7382812 0.01172,-1.265625 0.09375,-1.7929687 l -0.02344,-0.023437 c -0.1171871,0.011719 -0.3515621,0.023437 -0.4687496,0.023437 -0.1171875,0 -0.3515625,-0.011719 -0.46875,-0.023437 l -0.023437,0.023437 c 0.070312,0.5859375 0.09375,1.0429687 0.09375,1.7929687 v 0.984375 c -0.3046875,0.51562504 -0.8789062,1.04296879 -1.359375,1.04296879 -0.3515625,0 -0.6679687,-0.10546875 -0.6679687,-1.04296879 v -0.984375 c 0,-0.7382812 0,-1.265625 0.082031,-1.7929687 l -0.011719,-0.023437 c -0.1289063,0.011719 -0.3515625,0.023437 -0.46875,0.023437 -0.1289063,0 -0.3515625,-0.011719 -0.46875,-0.023437 l -0.023437,0.023437 c 0.082031,0.5742187 0.09375,1.0429687 0.09375,1.7929687 v 1.1953125 c 0,0.69140629 0.3164062,1.39453129 1.3007812,1.39453129 0.6210938,0 1.171875,-0.515625 1.5,-1.01953125 z M 12.582383,-3.3292891 v -0.9023437 c 0,-0.070312 -0.02344,-0.1054688 -0.05859,-0.1054688 -0.234375,0.046875 -0.585937,0.058594 -0.808594,0.035156 l -0.02344,0.023437 c 0.07031,0.4453125 0.09375,1.1835937 0.09375,1.9335937 v 0.5507813 c 0,0.75 -0.02344,1.26562499 -0.09375,1.79296874 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.351563,-0.03515625 0.46875,-0.03515625 0.117188,0 0.351563,0.01171875 0.46875,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.03125004 -0.09375,-1.79296874 v -0.515625 c 0,-0.421875 0.117187,-0.65625 0.339844,-0.984375 0.140625,-0.2226563 0.363281,-0.3515625 0.550781,-0.3515625 0.199219,0 0.398437,0.023437 0.527344,0.140625 l 0.07031,-0.023437 0.199219,-0.7382813 -0.03516,-0.035156 c -0.164063,-0.046875 -0.175782,-0.070312 -0.363282,-0.070312 -0.527343,0 -0.820312,0.3046875 -1.253906,1.0664062 z m 3.035156,0.609375 c 0.175781,-1.03125 0.820313,-1.2539062 1.136719,-1.2539062 0.386719,0 0.84375,0.3632812 0.84375,1.1132812 0,0.09375 -0.04687,0.140625 -0.140625,0.140625 z m 2.660156,1.6875 c -0.351562,0.37500004 -0.808593,0.53906254 -1.359375,0.53906254 -0.351562,0 -0.820312,-0.12890625 -1.089843,-0.56250004 -0.175782,-0.28125 -0.234375,-0.6796875 -0.234375,-1.265625 h 2.707031 c 0.105469,0 0.175781,-0.058594 0.175781,-0.1757812 0,-0.8320313 -0.410156,-1.875 -1.722656,-1.875 -1.019531,0 -2.039063,0.8203125 -2.039063,2.2851562 0,0.5742188 0.105469,1.12500004 0.445313,1.52343754 0.339844,0.421875 0.914062,0.66796875 1.582031,0.66796875 0.726563,0 1.359375,-0.375 1.734375,-0.890625 z m 6.887696,-5.5195312 c -1.6875,0 -3.269532,1.453125 -3.269532,3.4101562 0,1.7226563 0.984375,3.24609379 3.128907,3.24609379 0.914062,0 1.78125,-0.28125 2.449218,-1.08984375 -0.01172,-0.11718754 -0.02344,-0.33984374 -0.05859,-0.43359374 l -0.07031,-0.023437 c -0.703125,0.77343749 -1.300781,1.03124999 -2.226562,1.03124999 -1.265625,0 -2.191407,-1.35937499 -2.191407,-2.91796879 0,-2.015625 1.277344,-2.7539062 2.097657,-2.7539062 0.902343,0 1.628906,0.3515625 2.039062,1.171875 l 0.117188,-0.011719 c 0.02344,-0.5039062 0.07031,-0.6796875 0.175781,-1.1015625 l -0.02344,-0.035156 c 0,0 -0.949218,-0.4921875 -2.167968,-0.4921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="PureC"
+       transform="matrix(1.3333333,0,0,1.3333333,265.31333,50.177333)" />
+    <path
+       id="path68"
+       d="m 238.11302,130.39507 32.40416,-28.35381"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path69"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.0034,0.87797333,0.87797333,-1.0034,365.06824,83.715627)" />
+    <g
+       id="g73"
+       clip-path="url(#clipPath74)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,121.50511)"
+         style="fill:url(#linearGradient73);stroke:none"
+         id="path73" />
+    </g>
+    <path
+       id="path75"
+       d="m 181.4194,85.04042 v 22.67752 h 56.69362 V 85.04042 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path76"
+       d="m 4.0532812,-2.74175 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.2929687,0 -0.8320312,-0.011719 -1.0546875,-0.035156 l 1.0546875,-2.4375 h 0.023437 c 0.4101563,0.9375 0.7382813,1.7578125 1.007813,2.449219 z M 1.7798437,-2.273 c 0.2460938,-0.023437 0.9375,-0.023437 1.2890625,-0.023437 0.375,0 0.9257813,0 1.1601563,0.023437 C 4.72125,-0.9605 4.9439062,-0.1870625 5.0142187,0.03559375 5.1782812,4.375e-4 5.3423437,4.375e-4 5.5064062,4.375e-4 c 0.1757813,0 0.4335938,0 0.5976563,0.03515625 C 5.611875,-0.97221875 4.2876562,-4.3706563 3.3735937,-6.5503438 h -0.28125 C 2.143125,-4.3472188 1.15875,-2.1675313 0.16265625,0.03559375 0.27984375,4.375e-4 0.39703125,4.375e-4 0.5025,4.375e-4 c 0.1171875,0 0.328125,0 0.45703125,0.03515625 C 1.111875,-0.57378125 1.4282812,-1.3940938 1.7798437,-2.273 Z m 6.5947266,0.9609375 h -0.011719 c -1.3125,-3.0351563 -1.8164063,-4.7929688 -1.9101563,-5.1445313 -0.1640625,0.023437 -0.3984375,0.035156 -0.5390625,0.035156 -0.1640625,0 -0.4453125,-0.011719 -0.5976562,-0.035156 C 5.8081638,-5.4487813 7.1206638,-2.0855 8.046445,0.10590625 H 8.3159766 C 9.2769141,-2.1089375 10.24957,-4.2534688 11.257383,-6.4565938 c -0.117188,0.023437 -0.304688,0.035156 -0.398438,0.035156 -0.105468,0 -0.339843,-0.011719 -0.445312,-0.035156 -0.351563,1.2890625 -1.3242189,3.4921875 -2.0390627,5.1445313 z M 17.049375,0.03559375 C 16.2525,-1.101125 15.59625,-2.1089375 14.729062,-3.4331563 c 0.925782,-1.3359375 1.722657,-2.4140625 2.167969,-3.0234375 -0.117187,0.035156 -0.316406,0.035156 -0.421875,0.035156 -0.105469,0 -0.304687,0 -0.421875,-0.035156 -0.539062,1.0078125 -0.785156,1.3710938 -1.605469,2.5664063 -0.65625,-0.9960938 -1.324218,-1.96875 -1.605468,-2.5664063 -0.175782,0.035156 -0.398438,0.035156 -0.574219,0.035156 -0.164063,0 -0.386719,0 -0.550781,-0.035156 l 2.144531,3.2578125 -2.191406,3.23437505 C 11.775937,4.375e-4 11.975156,4.375e-4 12.080625,4.375e-4 c 0.117187,0 0.316406,0 0.421875,0.03515625 0.527344,-0.984375 1.101562,-1.92187505 1.664062,-2.76562505 0.632813,0.9609375 1.207032,1.7695313 1.734375,2.76562505 C 16.076719,4.375e-4 16.299375,4.375e-4 16.475156,4.375e-4 c 0.175781,0 0.398438,0 0.574219,0.03515625 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="AVX"
+       transform="matrix(1.3333333,0,0,1.3333333,272.58,95.530667)" />
+    <path
+       id="path77"
+       d="m 238.11302,96.37898 31.87468,0.66393"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path78"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,90.380133)" />
+    <g
+       id="g82"
+       clip-path="url(#clipPath83)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,166.86044)"
+         style="fill:url(#linearGradient82);stroke:none"
+         id="path82" />
+    </g>
+    <path
+       id="path84"
+       d="m 181.4194,51.02391 v 22.67752 h 56.69362 V 51.02391 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path85"
+       d="m 1.8974688,-4.4304766 c 0,-0.75 0.023437,-1.4648437 0.1054687,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867188,-0.011719 -0.51562505,-0.035156 L 0.94825,-6.4226641 c 0.0820313,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 0.95996875,0.03436719 c 0,0 0.17578125,-0.0351562525 0.51562505,-0.0351562525 h 2.71875 c 0.2109375,0 0.4335937,0.0117187525 0.5976562,0.0351562525 L 4.8154375,-7.890625e-4 C 4.8037188,-0.14141406 4.792,-0.34063281 4.792,-0.43438281 c 0,-0.10546875 0.011719,-0.26953125 0.023437,-0.3515625 L 4.792,-0.83282031 c 0,0 -1.4765625,0.22265625 -2.8125,0.22265625 C 1.909188,-0.84453906 1.897469,-1.8289141 1.897469,-1.9929766 Z m 4.265625,0 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 6.0927813,0.03436719 C 6.2099688,0.01092969 6.4795,-7.890625e-4 6.5966875,-7.890625e-4 c 0.1171875,0 0.3867188,0.0117187525 0.5039063,0.0351562525 L 7.1240313,-7.890625e-4 C 7.042,-0.56328906 7.0302813,-1.2429766 7.0302813,-1.9929766 v -2.4375 c 0,-0.75 0.011719,-1.4648437 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.093749,1.9921875 z m 3.7939453,-1.1367187 c 0,-0.328125 0.09375,-0.4570313 0.9023439,-0.4570313 0.550781,0 1.265625,0.2109375 1.265625,1.2304688 0,0.84375 -0.574219,1.1015625 -1.394531,1.1015625 H 9.9570391 Z m 0,2.296875 H 10.88282 c 1.160157,0 1.710938,0.7265625 1.710938,1.6289062 0,0.67968754 -0.246094,1.24218754 -1.675781,1.24218754 -0.703125,0 -0.9609379,-0.15234375 -0.9609379,-0.48046875 z M 9.5234453,-6.4226641 c -0.3398437,0 -0.5039062,-0.035156 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 9.0078203,0.03436719 c 0,0 0.1640625,-0.0351562525 0.515625,-0.0351562525 0.6210937,0 0.6914067,0.0351562525 1.5820317,0.0351562525 1.980468,0 2.472656,-1.0078125 2.472656,-1.86328129 0,-0.9609375 -0.644531,-1.5117187 -1.511719,-1.7578125 0.503906,-0.2460937 0.925781,-0.7734375 0.925781,-1.3242187 0,-0.6679688 -0.351562,-1.546875 -2.203125,-1.546875 -0.351562,0 -0.773437,0.035156 -1.2656247,0.035156 z M 19.809578,0.03436719 C 19.012703,-1.1023516 18.356453,-2.1101641 17.489266,-3.4343828 c 0.925781,-1.3359375 1.722656,-2.4140625 2.167968,-3.0234375 -0.117187,0.035156 -0.316406,0.035156 -0.421875,0.035156 -0.105468,0 -0.304687,0 -0.421875,-0.035156 -0.539062,1.0078125 -0.785156,1.3710937 -1.605468,2.5664062 -0.65625,-0.9960937 -1.324219,-1.96875 -1.605469,-2.5664062 -0.175781,0.035156 -0.398438,0.035156 -0.574219,0.035156 -0.164062,0 -0.386719,0 -0.550781,-0.035156 l 2.144531,3.2578125 -2.191406,3.23437499 c 0.105469,-0.0351562525 0.304687,-0.0351562525 0.410156,-0.0351562525 0.117188,0 0.316406,0 0.421875,0.0351562525 0.527344,-0.984375 1.101563,-1.92187499 1.664063,-2.76562499 0.632812,0.9609375 1.207031,1.76953124 1.734375,2.76562499 0.175781,-0.0351562525 0.398437,-0.0351562525 0.574218,-0.0351562525 0.175782,0 0.398438,0 0.574219,0.0351562525 z m 2.589844,-0.375 c -0.5625,0 -1.277344,-0.4921875 -1.582031,-1.07812499 l -0.105469,0.011719 c -0.03516,0.3632813 -0.152344,0.72656254 -0.222656,1.03125004 l 0.01172,0.0234375 c 0,0 0.644532,0.45703125 1.816407,0.45703125 1.21875,0 2.167968,-0.75 2.167968,-1.91015629 0,-1.1484375 -0.972656,-1.6875 -1.78125,-1.9921875 -0.503906,-0.1875 -1.21875,-0.4804687 -1.21875,-1.3007812 0,-0.3632813 0.199219,-0.7617188 0.457032,-0.9023438 0.175781,-0.082031 0.386718,-0.1171875 0.609375,-0.1171875 0.550781,0 1.089843,0.4335938 1.359375,1.0429688 l 0.09375,-0.011719 c 0.04687,-0.3515625 0.140625,-0.6914062 0.234375,-0.9960937 l -0.02344,-0.023437 c 0,0 -0.410156,-0.4453125 -1.582031,-0.4453125 -0.269531,0 -0.5625,0.046875 -0.855469,0.1640625 -0.585937,0.2460937 -1.089844,0.8320312 -1.089844,1.5585937 0,1.0429688 0.878907,1.5351563 1.710938,1.875 0.644531,0.2695313 1.160156,0.5976563 1.160156,1.453125 0,0.73828129 -0.574219,1.16015629 -1.160156,1.16015569 z m 4.549805,-6.25781249 c -0.351563,2.015625 -0.890625,4.7695312 -1.289063,6.63281249 0.117188,-0.0351562525 0.210938,-0.0351562525 0.328125,-0.0351562525 0.117188,0 0.199219,0 0.304688,0.0351562525 0.152343,-1.05468749 0.492187,-3.26953129 0.726562,-4.78124999 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097656,4.7460937375 h 0.1875 C 30.066414,-1.6296953 30.769539,-3.1531328 31.566414,-4.7117266 l 0.02344,0.011719 c 0.210937,1.5585937 0.410156,3.0820312 0.5625,4.73437499 0.152343,-0.0351562525 0.339843,-0.0351562525 0.480468,-0.0351562525 0.152344,0 0.410157,0 0.5625,0.0351562525 -0.363281,-2.13281249 -0.65625,-4.20703129 -0.9375,-6.63281249 h -0.246093 l -2.425782,4.9453125 h -0.07031 c -0.796875,-1.640625 -1.511719,-3.234375 -2.238281,-4.9453125 z m 8.824218,0 c -0.351562,2.015625 -0.890625,4.7695312 -1.289062,6.63281249 0.117187,-0.0351562525 0.210937,-0.0351562525 0.328125,-0.0351562525 0.117187,0 0.199219,0 0.304687,0.0351562525 0.152344,-1.05468749 0.492188,-3.26953129 0.726563,-4.78124999 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097656,4.7460937375 h 0.1875 C 38.890633,-1.6296953 39.593758,-3.1531328 40.390633,-4.7117266 l 0.02344,0.011719 c 0.210938,1.5585937 0.410157,3.0820312 0.5625,4.73437499 0.152344,-0.0351562525 0.339844,-0.0351562525 0.480469,-0.0351562525 0.152344,0 0.410156,0 0.5625,0.0351562525 -0.363281,-2.13281249 -0.65625,-4.20703129 -0.9375,-6.63281249 h -0.246094 l -2.425781,4.9453125 h -0.07031 c -0.796875,-1.640625 -1.511719,-3.234375 -2.238282,-4.9453125 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="LIBXSMM"
+       transform="matrix(1.3333333,0,0,1.3333333,255.61067,140.98933)" />
+    <path
+       id="path86"
+       d="m 238.11302,62.36288 32.42081,29.04401"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path87"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.99304,-0.88962667,-0.88962667,-0.99304,365.09041,97.894827)" />
+    <g
+       id="g91"
+       clip-path="url(#clipPath92)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,212.21524)"
+         style="fill:url(#linearGradient91);stroke:none"
+         id="path91" />
+    </g>
+    <path
+       id="path93"
+       d="m 181.4194,17.00783 v 22.67752 h 56.69362 V 17.00783 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path94"
+       d="m 3.7843047,-6.5520703 c -1.6875,0 -3.26953126,1.453125 -3.26953126,3.4101562 0,1.7226563 0.98437496,3.24609379 3.12890626,3.24609379 0.9140625,0 1.78125,-0.28125 2.4492187,-1.08984375 C 6.0811797,-1.1028516 6.0694609,-1.3255078 6.0343047,-1.4192578 l -0.070312,-0.023437 c -0.703125,0.77343749 -1.3007813,1.03124999 -2.2265625,1.03124999 -1.265625,0 -2.1914063,-1.35937499 -2.1914063,-2.91796879 0,-2.015625 1.2773438,-2.7539062 2.0976563,-2.7539062 0.9023437,0 1.6289062,0.3515625 2.0390625,1.171875 l 0.1171875,-0.011719 c 0.023437,-0.5039062 0.070312,-0.6796875 0.1757812,-1.1015625 l -0.023437,-0.035156 c 0,0 -0.9492187,-0.4921875 -2.1679687,-0.4921875 z m 8.3730473,2.1210937 v 1.6757813 c 0,1.1132812 -0.164063,2.29687499 -1.851563,2.29687499 -1.6640624,0 -1.6640624,-1.68749999 -1.6640624,-2.22656249 v -1.7460938 c 0,-0.75 0.011719,-1.4648437 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742188 c 0,2.03906249 1.4296875,2.46093749 2.3203129,2.46093749 2.050781,0 2.601562,-1.26562499 2.601562,-3.04687499 v -1.4882813 c 0,-0.75 0.02344,-1.4648437 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.222656,0.035156 -0.351562,0.035156 -0.117188,0 -0.222656,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 3.380859,3.60937504 V -5.6145703 c 0,-0.3515625 0.339844,-0.3984375 0.832031,-0.3984375 2.15625,0 2.800781,1.6523437 2.800781,3.1875 0,2.00390624 -1.148437,2.40234374 -2.554687,2.40234374 -0.972656,0 -1.078125,-0.0820312 -1.078125,-0.3984375 z M 15.104617,-6.4231641 c -0.339844,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257813,-0.64453125 3.257813,-3.09375004 0,-1.8515625 -1.535157,-3.3867187 -3.445313,-3.3867187 -0.632812,0 -1.007812,0.035156 -1.640625,0.035156 z m 9.372071,3.6796875 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832032,-0.011719 -1.054688,-0.035156 l 1.054688,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007813,2.4492187 z m -2.273438,0.46875 c 0.246094,-0.023437 0.9375,-0.023437 1.289063,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492187,1.31250004 0.714844,2.08593754 0.785156,2.30859379 0.164063,-0.03515625 0.328125,-0.03515625 0.492188,-0.03515625 0.175781,0 0.433593,0 0.597656,0.03515625 C 26.035281,-0.97394531 24.711063,-4.3723828 23.797,-6.5520703 h -0.28125 c -0.949219,2.203125 -1.933594,4.3828125 -2.929687,6.58593749 0.117187,-0.03515625 0.234375,-0.03515625 0.339843,-0.03515625 0.117188,0 0.328125,0 0.457032,0.03515625 0.152343,-0.609375 0.46875,-1.42968749 0.820312,-2.30859379 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="CUDA"
+       transform="matrix(1.3333333,0,0,1.3333333,266.27067,186.24)" />
+    <path
+       id="path95"
+       d="m 238.11302,28.3468 31.87468,0.66391"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path96"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,181.08973)" />
+    <g
+       id="g100"
+       clip-path="url(#clipPath101)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,257.56997)"
+         style="fill:url(#linearGradient100);stroke:none"
+         id="path100" />
+    </g>
+    <path
+       id="path102"
+       d="M 181.4194,-17.00824 V 5.66927 h 56.69362 v -22.67751 z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path103"
+       d="m 5.3424141,-4.4302187 v 0.796875 H 1.8971016 v -0.796875 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.50390624,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312495 -0.09375,1.99218745 L 0.94788281,0.034625 C 1.0650703,0.0111875 1.3463203,-5.3125e-4 1.4635078,-5.3125e-4 c 0.1171875,0 0.3867188,0.01171875 0.5039063,0.03515625 l 0.023437,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218745 -0.09375,-1.99218745 V -3.19975 h 3.4453125 v 1.2070313 c 0,0.75 -0.023437,1.45312495 -0.1054688,1.99218745 L 5.2486641,0.034625 c 0.1171875,-0.0234375 0.3984375,-0.03515625 0.515625,-0.03515625 0.1171875,0 0.3867187,0.01171875 0.515625,0.03515625 l 0.011719,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218745 -0.09375,-1.99218745 v -2.4375 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867188,-0.011719 -0.5039063,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.1054688,1.2421875 0.1054683,1.9921875 z m 2.9208984,0 v 2.4375 c 0,0.75 -0.011719,1.45312495 -0.09375,1.99218745 L 8.193,0.034625 C 8.3101875,0.0111875 8.5797187,-5.3125e-4 8.6969062,-5.3125e-4 c 0.1171875,0 0.3867188,0.01171875 0.5039063,0.03515625 L 9.22425,-5.3125e-4 C 9.1422187,-0.56303125 9.1305,-1.2427187 9.1305,-1.9927187 v -2.4375 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 3.7968745,-1.1953125 c 0,-0.2929688 0.175782,-0.4453125 0.890625,-0.4453125 0.691407,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367187 -0.5625,1.5117187 -1.523437,1.5117187 -0.257813,0 -0.65625,-0.035156 -0.785157,-0.1054687 z M 11.193,-4.4302187 v 2.4375 c 0,0.75 -0.01172,1.45312495 -0.09375,1.99218745 L 11.12269,0.034625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503907,-0.03515625 0.117187,0 0.386718,0.01171875 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218745 -0.09375,-1.99218745 v -0.8554688 c 0.210938,0.070312 0.480469,0.09375 0.832032,0.09375 1.816406,0 2.4375,-1.1367187 2.4375,-1.9921875 0,-0.7382812 -0.46875,-1.7460937 -2.308594,-1.7460937 -0.257813,0 -1.101563,0.070312 -1.394531,0.070312 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="HIP"
+       transform="matrix(1.3333333,0,0,1.3333333,273.576,231.64133)" />
+    <path
+       id="path104"
+       d="m 238.11302,-5.6697 31.87468,0.66392"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path105"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,226.44507)" />
+    <g
+       id="g109"
+       clip-path="url(#clipPath110)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,302.92473)"
+         style="fill:url(#linearGradient109);stroke:none"
+         id="path109" />
+    </g>
+    <path
+       id="path111"
+       d="m 181.4194,-51.02432 v 22.67752 h 56.69362 v -22.67752 z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path112"
+       d="m 2.4131094,-0.33892969 c -0.5625,0 -1.2773438,-0.4921875 -1.58203127,-1.07812501 l -0.10546875,0.011719 c -0.0351563,0.3632812 -0.15234375,0.72656246 -0.22265625,1.03124996 l 0.0117188,0.0234375 c 0,0 0.64453127,0.45703125 1.81640627,0.45703125 1.21875,0 2.1679688,-0.75 2.1679688,-1.91015621 0,-1.1484375 -0.9726563,-1.6875 -1.78125,-1.9921875 -0.5039063,-0.1875 -1.21875,-0.4804688 -1.21875,-1.3007813 0,-0.3632812 0.1992187,-0.7617187 0.4570312,-0.9023437 0.1757813,-0.082031 0.3867188,-0.1171875 0.609375,-0.1171875 0.5507813,0 1.0898438,0.4335937 1.359375,1.0429687 l 0.09375,-0.011719 c 0.046875,-0.3515625 0.140625,-0.6914063 0.234375,-0.9960938 l -0.023437,-0.023437 c 0,0 -0.4101561,-0.4453125 -1.5820311,-0.4453125 -0.2695314,0 -0.5625,0.046875 -0.8554689,0.1640625 -0.5859375,0.2460938 -1.08984367,0.8320313 -1.08984367,1.5585938 0,1.0429687 0.87890617,1.5351562 1.71093757,1.875 0.6445311,0.2695312 1.1601562,0.5976562 1.1601562,1.453125 0,0.73828121 -0.5742187,1.16015621 -1.1601568,1.16015581 z M 8.3574453,-2.8350234 c 0.4101563,-0.796875 1.1015625,-1.78125 2.2968747,-3.6210938 -0.128906,0.023437 -0.292968,0.035156 -0.410156,0.035156 -0.128906,0 -0.3046874,-0.011719 -0.4218749,-0.035156 -0.4921875,1.0429688 -1.1601563,2.1679688 -1.7460938,3.0703125 -0.6210937,-1.0898437 -1.2070312,-2.0273437 -1.734375,-3.0703125 -0.1757812,0.023437 -0.3984375,0.035156 -0.5742187,0.035156 -0.1640625,0 -0.3984375,-0.011719 -0.5625,-0.035156 0.3984375,0.5976563 1.921875,3.0703125 2.296875,3.7148438 0,0.984375 -0.023437,2.22656246 -0.070312,2.77734371 0.1171875,-0.0234375 0.375,-0.0351562475 0.5039062,-0.0351562475 0.1171875,0 0.375,0.0117187475 0.4921875,0.0351562475 -0.035156,-0.48046875 -0.058594,-1.88671871 -0.070313,-2.87109371 z m 5.8974607,-3.7148438 c -1.6875,0 -3.269531,1.453125 -3.269531,3.4101563 0,1.7226562 0.984375,3.24609371 3.128906,3.24609371 0.914063,0 1.78125,-0.28125 2.449219,-1.08984375 C 16.55178,-1.1006484 16.54006,-1.3233047 16.50491,-1.4170547 l -0.07031,-0.023437 c -0.703125,0.77343751 -1.300781,1.03125001 -2.226563,1.03125001 -1.265625,0 -2.191406,-1.35937501 -2.191406,-2.91796871 0,-2.015625 1.277344,-2.7539063 2.097656,-2.7539063 0.902344,0 1.628907,0.3515625 2.039063,1.171875 l 0.117187,-0.011719 c 0.02344,-0.5039063 0.07031,-0.6796875 0.175782,-1.1015625 l -0.02344,-0.035156 c 0,0 -0.949219,-0.4921875 -2.167969,-0.4921875 z m 4.860352,2.1210938 c 0,-0.75 0.02344,-1.4648438 0.105469,-1.9921875 l -0.01172,-0.035156 c -0.117188,0.023437 -0.398438,0.035156 -0.515625,0.035156 -0.117188,0 -0.386719,-0.011719 -0.515625,-0.035156 l -0.01172,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.9921874625 l 0.01172,0.0351562475 c 0,0 0.175781,-0.0351562475 0.515625,-0.0351562475 h 2.71875 c 0.210937,0 0.433594,0.0117187475 0.597656,0.0351562475 L 22.033227,9.140625e-4 C 22.021508,-0.13971094 22.009789,-0.33892969 22.009789,-0.43267969 c 0,-0.10546875 0.01172,-0.26953125 0.02344,-0.3515625 l -0.02344,-0.046875 c 0,0 -1.476562,0.22265625 -2.8125,0.22265625 -0.07031,-0.234375 -0.08203,-1.21874996 -0.08203,-1.38281246 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="SYCL"
+       transform="matrix(1.3333333,0,0,1.3333333,269.16533,276.948)" />
+    <path
+       id="path113"
+       d="m 238.11302,-39.68576 31.87468,0.66391"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path114"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,271.79983)" />
+    <g
+       id="g118"
+       clip-path="url(#clipPath119)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,348.27952)"
+         style="fill:url(#linearGradient118);stroke:none"
+         id="path118" />
+    </g>
+    <path
+       id="path120"
+       d="m 181.4194,-85.0404 v 22.67752 h 56.69362 V -85.0404 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path121"
+       d="M 1.9216328,-6.5969687 C 1.5700703,-4.5813437 1.0310078,-1.8274375 0.63257031,0.03584375 0.74975781,6.875e-4 0.84350781,6.875e-4 0.96069531,6.875e-4 c 0.11718749,0 0.19921879,0 0.30468749,0.03515625 C 1.4177266,-1.0188437 1.7575703,-3.2336875 1.9919453,-4.7454062 h 0.046875 c 0.75,1.59375 1.453125,3.1992187 2.0976563,4.7460937 h 0.1875 C 5.0388203,-1.6282187 5.7419453,-3.1516562 6.5388203,-4.71025 l 0.023437,0.011719 C 6.7731953,-3.1399375 6.9724141,-1.6165 7.1247578,0.03584375 7.2771016,6.875e-4 7.4646016,6.875e-4 7.6052266,6.875e-4 c 0.1523437,0 0.4101562,0 0.5625,0.03515625 -0.3632813,-2.13281245 -0.65625,-4.20703125 -0.9375,-6.63281245 H 6.9841328 L 4.5583516,-1.6516562 H 4.4880391 C 3.6911641,-3.2922812 2.9763203,-4.8860312 2.2497578,-6.5969687 Z M 12.881594,-2.7415 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832032,-0.011719 -1.054688,-0.035156 l 1.054688,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007813,2.4492187 z m -2.273438,0.46875 c 0.246094,-0.023437 0.9375,-0.023437 1.289063,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492187,1.3125 0.714844,2.0859375 0.785156,2.30859375 C 14.006594,6.875e-4 14.170656,6.875e-4 14.334719,6.875e-4 c 0.175781,0 0.433593,0 0.597656,0.03515625 -0.492188,-1.0078125 -1.816406,-4.40624995 -2.730469,-6.58593745 h -0.28125 C 10.971437,-4.3469687 9.9870625,-2.1672812 8.9909687,0.03584375 9.1081562,6.875e-4 9.2253437,6.875e-4 9.3308125,6.875e-4 c 0.1171875,0 0.328125,0 0.4570312,0.03515625 0.1523438,-0.609375 0.4687503,-1.42968745 0.8203123,-2.30859375 z m 9.583008,1.1132813 v 0.44531245 c -0.339844,0.31640625 -0.925781,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546875 -2.425781,-2.91796875 0,-1.7460937 0.960938,-2.859375 2.308594,-2.859375 0.890625,0 1.664062,0.5039063 2.109375,1.2539063 l 0.117187,-0.011719 c 0.03516,-0.5039062 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054687,-0.4921875 -2.273437,-0.4921875 -1.710938,0 -3.480469,1.2070312 -3.480469,3.3984375 0,1.7226562 1.230469,3.25781245 3.234375,3.25781245 1.148438,0 1.96875,-0.33984375 2.648438,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105469,-0.3632813 -0.105469,-0.46875 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105469,-1.6992187 l -0.01172,-0.035156 c 0,0 -0.175781,0.035156 -0.515625,0.035156 -0.339844,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484375 0.09375,1.8984375 z m 3.629883,-5.4375 c -0.351563,2.015625 -0.890625,4.7695312 -1.289063,6.63281245 C 22.649172,6.875e-4 22.742922,6.875e-4 22.860109,6.875e-4 c 0.117188,0 0.199219,0 0.304688,0.03515625 0.152344,-1.05468745 0.492187,-3.26953125 0.726562,-4.78124995 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097657,4.7460937 h 0.1875 c 0.714843,-1.6289062 1.417968,-3.1523437 2.214843,-4.7109375 l 0.02344,0.011719 c 0.210937,1.5585937 0.410156,3.0820312 0.5625,4.73437495 C 29.176516,6.875e-4 29.364016,6.875e-4 29.504641,6.875e-4 c 0.152343,0 0.410156,0 0.5625,0.03515625 -0.363282,-2.13281245 -0.65625,-4.20703125 -0.9375,-6.63281245 h -0.246094 l -2.425781,4.9453125 h -0.07031 c -0.796875,-1.640625 -1.511719,-3.234375 -2.238281,-4.9453125 z M 34.778078,-2.7415 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832031,-0.011719 -1.054687,-0.035156 l 1.054687,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007812,2.4492187 z m -2.273437,0.46875 c 0.246093,-0.023437 0.9375,-0.023437 1.289062,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492188,1.3125 0.714844,2.0859375 0.785157,2.30859375 C 35.903078,6.875e-4 36.067141,6.875e-4 36.231203,6.875e-4 c 0.175781,0 0.433594,0 0.597656,0.03515625 -0.492187,-1.0078125 -1.816406,-4.40624995 -2.730468,-6.58593745 h -0.28125 c -0.949219,2.203125 -1.933594,4.3828125 -2.929688,6.58593745 C 31.004641,6.875e-4 31.121828,6.875e-4 31.227297,6.875e-4 c 0.117187,0 0.328125,0 0.457031,0.03515625 0.152344,-0.609375 0.46875,-1.42968745 0.820313,-2.30859375 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="MAGMA"
+       transform="matrix(1.3333333,0,0,1.3333333,259.40267,322.40533)" />
+    <path
+       id="path122"
+       d="m 238.11302,-73.70186 33.33023,98.60217"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path123"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.42693333,-1.2630667,-1.2630667,-0.42693333,366.30299,186.57025)" />
+    <path
+       id="path124"
+       d="m 238.11302,-73.70186 33.04201,64.7076"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path125"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.60634667,-1.18744,-1.18744,-0.60634667,365.91871,231.76304)" />
+    <path
+       id="path126"
+       d="m 6.4102891,-5.3204844 v 0.9609375 H 2.2735703 v -0.9609375 c 0,-0.890625 0.023437,-1.7578125 0.1171875,-2.390625 l -0.011719,-0.035156 c -0.140625,0.023437 -0.4804688,0.035156 -0.6210938,0.035156 -0.140625,0 -0.46875,-0.011719 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171875,2.390625 v 2.9296875 c 0,0.890625 -0.023437,1.74609377 -0.1171875,2.390625025 L 1.136852,0.03498437 c 0.140625,-0.0234375 0.4804687,-0.035156245 0.6210937,-0.035156245 0.140625,0 0.46875,0.011718745 0.609375,0.035156245 L 2.3907578,-1.71875e-4 C 2.2970078,-0.67985938 2.2735703,-1.5001719 2.2735703,-2.3907969 v -1.4414062 h 4.1367188 v 1.4414062 c 0,0.890625 -0.023437,1.74609377 -0.1171875,2.390625025 l 0.011719,0.035156245 c 0.140625,-0.0234375 0.46875,-0.035156245 0.6210938,-0.035156245 0.140625,0 0.4570312,0.011718745 0.609375,0.035156245 L 7.5587266,-1.71875e-4 C 7.4532578,-0.67985938 7.4298203,-1.5001719 7.4298203,-2.3907969 v -2.9296875 c 0,-0.890625 0.023437,-1.7578125 0.1289063,-2.390625 l -0.011719,-0.035156 c -0.1523437,0.023437 -0.4804687,0.035156 -0.6210937,0.035156 -0.1523438,0 -0.46875,-0.011719 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171872,2.390625 z m 5.9736329,2.625 -0.04687,1.4296875 c 0,0.1523438 -0.07031,0.234375 -0.164063,0.30468752 -0.339843,0.2578125 -0.738281,0.46875 -1.101562,0.46875 -0.539063,0 -0.878906,-0.3515625 -0.878906,-0.72656252 0,-0.5390625 0.246093,-0.9492187 1.183593,-1.1953125 z m 0,2.12109377 c 0.128906,0.46875 0.492187,0.69140625 0.9375,0.69140625 0.292969,0 0.667969,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128907,0.0351563 -0.234375,0.0585938 -0.316407,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117187,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749992 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914062,0 -1.4648434,0.4453125 -1.7578121,0.6914062 l -0.035156,0.046875 0.1992188,0.8085937 0.1523437,0.011719 c 0.3398436,-0.5390625 0.7382806,-1.0195313 1.3124996,-1.0195313 0.433594,0 1.148438,0.058594 1.148438,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523438 l -1.113281,0.2460937 c -1.2187496,0.28125 -1.9804683,0.9257813 -1.9804683,1.734375 0,0.89062507 0.609375,1.28906257 1.4882813,1.28906257 0.667968,0 0.996093,-0.15234375 1.628906,-0.69140625 z m 4.037109,-3.42187497 v -1.0898438 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269532,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.02344,0.035156 c 0.08203,0.5273438 0.105468,1.4296875 0.105468,2.3203125 v 0.65625 c 0,0.9023438 -0.02344,1.52343752 -0.105468,2.156250025 l 0.02344,0.035156245 c 0.140625,-0.0234375 0.410156,-0.035156245 0.5625,-0.035156245 0.140625,0 0.410156,0.011718745 0.550781,0.035156245 l 0.03516,-0.035156245 C 16.43275,-0.67985938 16.421031,-1.2423594 16.421031,-2.1564219 v -0.6210937 c 0,-0.4921875 0.152344,-0.7851563 0.410156,-1.1835938 0.164063,-0.2578125 0.445313,-0.4101562 0.667969,-0.4101562 0.246094,0 0.46875,0.023437 0.621094,0.1640625 l 0.09375,-0.023437 0.246094,-0.890625 -0.04687,-0.046875 c -0.210938,-0.058594 -0.222657,-0.082031 -0.433594,-0.082031 -0.644531,0 -0.996094,0.375 -1.523438,1.2773438 z m 6.46875,2.6484375 c -0.445312,0.56249997 -0.984375,0.84374997 -1.464844,0.84374997 -0.644531,0 -1.195312,-0.64453127 -1.195312,-2.13281247 0,-1.7929688 0.9375,-2.1328125 1.523437,-2.1328125 0.5625,0 0.855469,0.234375 1.136719,0.6914062 z m 0,0.65624997 h 0.02344 l 0.05859,0.691406255 c 0,0.023437495 0.03516,0.035156245 0.117188,0.035156245 0.152344,-0.01171875 0.234375,-0.035156245 0.398437,-0.035156245 0.152344,0 0.386719,0.011718745 0.539063,0.035156245 l 0.02344,-0.035156245 C 23.956187,-0.51579688 23.850719,-1.3947031 23.850719,-2.2970469 v -3.6796875 c 0,-0.8789062 0.04687,-1.5117187 0.105468,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.105468,-0.1171875 -0.304688,0.1171875 -0.5625,0.1992188 -1.042969,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105469,1.40625 0.105469,2.3085937 v 0.7851563 c -0.269531,-0.1640625 -0.761719,-0.2695313 -0.996094,-0.2695313 -1.570312,0 -2.707031,1.078125 -2.707031,2.7070313 0,1.4648437 0.867188,2.66015622 2.167969,2.66015622 0.574219,0 1.113281,-0.24609375 1.535156,-0.80859375 z m 4.473633,0.73828125 c 0.117188,-0.03515625 0.28125,-0.03515625 0.398438,-0.03515625 0.117187,0 0.316406,0 0.421875,0.03515625 0.328125,-0.9140625 0.679687,-1.89843752 1.054687,-2.83593752 0.363281,0.9375 0.738281,1.91015627 1.089844,2.82421877 0.117187,-0.035156245 0.1875,-0.035156245 0.316406,-0.035156245 0.117188,0 0.328125,0 0.433594,0.035156245 0.585937,-1.40624997 1.546875,-3.70312497 2.226562,-5.20312497 -0.105468,0.035156 -0.398437,0.035156 -0.515625,0.035156 -0.117187,0 -0.292968,0 -0.398437,-0.035156 -0.445313,1.4179687 -0.984375,2.9414062 -1.488281,4.0898437 h -0.09375 c -0.492188,-1.3125 -0.867188,-2.8242187 -1.136719,-4.0898437 -0.152344,0.035156 -0.433594,0.035156 -0.597656,0.035156 -0.1875,0 -0.503907,0 -0.691407,-0.035156 0.152344,0.4921875 0.339844,1.0195312 0.550782,1.5703125 -0.316407,0.890625 -0.644532,1.7929687 -0.996094,2.53125 h -0.07031 c -0.597656,-1.359375 -0.996093,-2.7070313 -1.417968,-4.0898438 -0.164063,0.035156 -0.433594,0.035156 -0.597657,0.035156 -0.199218,0 -0.503906,0 -0.703125,-0.035156 0.808594,1.7578125 1.546875,3.4570313 2.214844,5.20312502 z m 9.884766,-2.74218752 -0.04687,1.4296875 c 0,0.1523438 -0.07031,0.234375 -0.164063,0.30468752 -0.339844,0.2578125 -0.738281,0.46875 -1.101562,0.46875 -0.539063,0 -0.878907,-0.3515625 -0.878907,-0.72656252 0,-0.5390625 0.246094,-0.9492187 1.183594,-1.1953125 z m 0,2.12109377 c 0.128906,0.46875 0.492187,0.69140625 0.9375,0.69140625 0.292968,0 0.667968,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117188,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749992 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914062,0 -1.464844,0.4453125 -1.757812,0.6914062 l -0.03516,0.046875 0.199219,0.8085937 0.152344,0.011719 c 0.339844,-0.5390625 0.738281,-1.0195313 1.3125,-1.0195313 0.433594,0 1.148437,0.058594 1.148437,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523438 l -1.113281,0.2460937 c -1.21875,0.28125 -1.980469,0.9257813 -1.980469,1.734375 0,0.89062507 0.609375,1.28906257 1.488281,1.28906257 0.667969,0 0.996094,-0.15234375 1.628907,-0.69140625 z m 4.040039,-3.42187497 v -1.0898438 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960937,0.035156 l -0.02344,0.035156 c 0.08203,0.5273438 0.105469,1.4296875 0.105469,2.3203125 v 0.65625 c 0,0.9023438 -0.02344,1.52343752 -0.105469,2.156250025 l 0.02344,0.035156245 c 0.140625,-0.0234375 0.410156,-0.035156245 0.5625,-0.035156245 0.140625,0 0.410156,0.011718745 0.550781,0.035156245 l 0.03516,-0.035156245 C 41.299937,-0.67985938 41.288219,-1.2423594 41.288219,-2.1564219 v -0.6210937 c 0,-0.4921875 0.152343,-0.7851563 0.410156,-1.1835938 0.164062,-0.2578125 0.445312,-0.4101562 0.667969,-0.4101562 0.246093,0 0.46875,0.023437 0.621093,0.1640625 l 0.09375,-0.023437 0.246094,-0.890625 -0.04687,-0.046875 c -0.210937,-0.058594 -0.222656,-0.082031 -0.433594,-0.082031 -0.644531,0 -0.996093,0.375 -1.523437,1.2773438 z m 3.65625,0.7382812 c 0.199218,-1.2539062 0.972656,-1.5117187 1.359375,-1.5117187 0.46875,0 0.996093,0.4453125 0.996093,1.3359375 0,0.1054687 -0.04687,0.1757812 -0.164062,0.1757812 z m 3.1875,2.015625 c -0.421875,0.45703127 -0.984375,0.65625002 -1.640625,0.65625002 -0.421875,0 -0.984375,-0.15234375 -1.300782,-0.67968752 -0.210937,-0.3398437 -0.292968,-0.8085937 -0.292968,-1.5234375 h 3.257812 c 0.128906,0 0.210938,-0.070312 0.210938,-0.1992187 0,-1.0078125 -0.492188,-2.2617188 -2.0625,-2.2617188 -1.230469,0 -2.449219,0.9960938 -2.449219,2.7539063 0,0.6796875 0.128906,1.3476562 0.539062,1.82812497 0.398438,0.50390625 1.078125,0.78515625 1.898438,0.78515625 0.855469,0 1.617187,-0.4453125 2.074219,-1.06640625 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Hardware"
+       transform="matrix(1.3333333,0,0,1.3333333,376.21467,17.398667)" />
+    <g
+       id="g130"
+       clip-path="url(#clipPath131)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,121.50511)"
+         style="fill:url(#linearGradient130);stroke:none"
+         id="path130" />
+    </g>
+    <path
+       id="path132"
+       d="m 272.12953,85.04042 v 22.67752 h 62.36289 V 85.04042 Z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path133"
+       d="m 3.7843438,-6.5503438 c -1.6875,0 -3.2695313,1.453125 -3.2695313,3.4101563 0,1.7226562 0.984375,3.24609375 3.1289063,3.24609375 0.9140625,0 1.78125,-0.28125 2.4492187,-1.08984375 C 6.0812188,-1.101125 6.0695,-1.3237813 6.0343438,-1.4175313 l -0.070312,-0.023437 c -0.703125,0.77343755 -1.3007813,1.03125005 -2.2265625,1.03125005 -1.265625,0 -2.1914063,-1.35937505 -2.1914063,-2.91796875 0,-2.015625 1.2773438,-2.7539063 2.0976563,-2.7539063 0.9023437,0 1.6289062,0.3515625 2.0390625,1.171875 l 0.1171875,-0.011719 C 5.8234063,-5.4253438 5.8702813,-5.601125 5.97575,-6.023 L 5.952313,-6.058156 c 0,0 -0.9492187,-0.4921875 -2.1679692,-0.4921878 z m 4.8574218,0.9257813 c 0,-0.2929688 0.1757813,-0.4453125 0.890625,-0.4453125 0.6914064,0 1.4179684,0.1875 1.4179684,1.3828125 0,1.1367187 -0.5625,1.5117187 -1.5234371,1.5117187 -0.2578125,0 -0.65625,-0.035156 -0.7851563,-0.1054687 z M 7.7745781,-4.42925 v 2.4375 c 0,0.75 -0.011719,1.453125 -0.09375,1.9921875 l 0.023437,0.03515625 C 7.8214531,0.01215625 8.0909844,4.375e-4 8.2081719,4.375e-4 c 0.1171875,0 0.3867187,0.01171875 0.5039062,0.03515625 L 8.7355156,4.375e-4 c -0.082031,-0.5625 -0.09375,-1.2421875 -0.09375,-1.9921875 v -0.8554688 c 0.2109375,0.070312 0.4804688,0.09375 0.8320313,0.09375 1.8164061,0 2.4375001,-1.1367187 2.4375001,-1.9921875 0,-0.7382812 -0.46875,-1.7460937 -2.3085939,-1.7460937 -0.2578125,0 -1.1015625,0.070312 -1.3945312,0.070312 -0.1171875,0 -0.3867188,-0.011719 -0.5039063,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.992188 z m 10.4414059,0 v 1.6757812 c 0,1.1132813 -0.164062,2.29687505 -1.851562,2.29687505 -1.664063,0 -1.664063,-1.68750005 -1.664063,-2.22656255 V -4.42925 c 0,-0.75 0.01172,-1.4648438 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742187 c 0,2.03906255 1.429687,2.46093755 2.320312,2.46093755 2.050782,0 2.601563,-1.26562505 2.601563,-3.04687505 V -4.42925 c 0,-0.75 0.02344,-1.4648438 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.222656,0.035156 -0.351562,0.035156 -0.117188,0 -0.222657,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="CPU"
+       transform="matrix(1.3333333,0,0,1.3333333,395.65733,95.530667)" />
+    <g
+       id="g137"
+       clip-path="url(#clipPath138)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,212.21524)"
+         style="fill:url(#linearGradient137);stroke:none"
+         id="path137" />
+    </g>
+    <path
+       id="path139"
+       d="m 272.12953,17.00783 v 22.67752 h 62.36289 V 17.00783 Z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path140"
+       d="m 1.0775,-6.4231641 c 0.082031,1.8867188 0.058594,4.8398438 -0.10546875,6.42187504 l 0.0234375,0.03515625 C 1.1126563,0.01042969 1.2298438,-0.00128906 1.3470313,-0.00128906 c 0.1171875,0 0.2226562,0.01171875 0.3515625,0.03515625 l 0.011719,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -2.3789062 c 0,-0.609375 0.011719,-0.6445313 0.3867188,-0.1640625 L 5.48375,-0.14191406 c 0.1054688,0.15234375 0.2578125,0.24609375 0.421875,0.24609375 0.140625,0 0.1757813,-0.12890625 0.1875,-0.31640625 C 6.14,-2.4856641 6.14,-4.2200391 6.2923438,-6.4231641 l -0.011719,-0.035156 c -0.1289062,0.023437 -0.234375,0.035156 -0.3515625,0.035156 -0.1171875,0 -0.234375,-0.011719 -0.3515625,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.1054688,1.2421875 0.1054688,1.9921875 v 2.6601563 c -0.023437,0.5390625 -0.1640625,0.3164062 -0.5859375,-0.2578125 l -3.421875,-4.4179688 c 0,0 -0.09375,0.023437 -0.140625,0.023437 -0.3398438,0 -0.4101563,-0.035156 -0.4101563,-0.035156 z m 9.536133,5.109375 h -0.01172 C 9.2894141,-4.3489453 8.7855078,-6.1067578 8.6917578,-6.4583203 c -0.1640625,0.023437 -0.3984375,0.035156 -0.5390625,0.035156 -0.1640625,0 -0.4453125,-0.011719 -0.5976562,-0.035156 0.4921875,1.0078125 1.8046875,4.3710937 2.7304689,6.56249999 h 0.269531 c 0.960938,-2.21484379 1.933594,-4.35937499 2.941406,-6.56249999 -0.117187,0.023437 -0.304687,0.035156 -0.398437,0.035156 -0.105469,0 -0.339844,-0.011719 -0.445313,-0.035156 -0.351562,1.2890625 -1.324218,3.4921875 -2.039062,5.1445312 z m 4.095703,-3.1171875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503907,-0.03515625 0.117187,0 0.386718,0.01171875 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -2.4375 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 3.796875,3.60937504 V -5.6145703 c 0,-0.3515625 0.339844,-0.3984375 0.832031,-0.3984375 2.15625,0 2.800781,1.6523437 2.800781,3.1875 0,2.00390624 -1.148437,2.40234374 -2.554687,2.40234374 -0.972656,0 -1.078125,-0.0820312 -1.078125,-0.3984375 z M 18.072617,-6.4231641 c -0.339844,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257813,-0.64453125 3.257813,-3.09375004 0,-1.8515625 -1.535157,-3.3867187 -3.445313,-3.3867187 -0.632812,0 -1.007812,0.035156 -1.640625,0.035156 z m 6.638672,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.02344,0.03515625 c 0.117187,-0.0234375 0.386718,-0.03515625 0.503906,-0.03515625 0.117187,0 0.386719,0.01171875 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -2.4375 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 5.953125,1.6875 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832031,-0.011719 -1.054687,-0.035156 l 1.054687,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007812,2.4492187 z m -2.273437,0.46875 c 0.246093,-0.023437 0.9375,-0.023437 1.289062,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492188,1.31250004 0.714844,2.08593754 0.785157,2.30859379 0.164062,-0.03515625 0.328125,-0.03515625 0.492187,-0.03515625 0.175781,0 0.433594,0 0.597656,0.03515625 -0.492187,-1.0078125 -1.816406,-4.40624999 -2.730468,-6.58593749 h -0.28125 c -0.949219,2.203125 -1.933594,4.3828125 -2.929688,6.58593749 0.117188,-0.03515625 0.234375,-0.03515625 0.339844,-0.03515625 0.117187,0 0.328125,0 0.457031,0.03515625 0.152344,-0.609375 0.46875,-1.42968749 0.820313,-2.30859379 z m 12.413086,1.1132813 v 0.44531249 c -0.339844,0.31640625 -0.925782,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546879 -2.425781,-2.91796879 0,-1.7460937 0.960937,-2.859375 2.308593,-2.859375 0.890625,0 1.664063,0.5039063 2.109375,1.2539063 l 0.117188,-0.011719 c 0.03516,-0.5039062 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054688,-0.4921875 -2.273438,-0.4921875 -1.710937,0 -3.480468,1.2070312 -3.480468,3.3984375 0,1.7226562 1.230468,3.25781249 3.234375,3.25781249 1.148437,0 1.96875,-0.33984375 2.648437,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105469,-0.36328134 -0.105469,-0.46875004 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105469,-1.6992187 l -0.01172,-0.035156 c 0,0 -0.175781,0.035156 -0.515625,0.035156 -0.339843,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484375 0.09375,1.8984375 z m 3.603515,-4.4648438 c 0,-0.2929687 0.175781,-0.4453125 0.890625,-0.4453125 0.691406,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.523438,1.5117188 -0.257812,0 -0.65625,-0.035156 -0.785156,-0.1054688 z m -0.867187,1.1953125 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503906,-0.03515625 0.117188,0 0.386719,0.01171875 0.503907,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -0.8554687 c 0.210938,0.070312 0.480469,0.09375 0.832031,0.09375 1.816407,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.308593,-1.7460938 -0.257813,0 -1.101563,0.070312 -1.394532,0.070312 -0.117187,0 -0.386718,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 10.441406,0 v 1.6757813 c 0,1.1132812 -0.164063,2.29687499 -1.851563,2.29687499 -1.664062,0 -1.664062,-1.68749999 -1.664062,-2.22656249 v -1.7460938 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.386718,0.035156 -0.503906,0.035156 -0.117187,0 -0.386719,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742188 c 0,2.03906249 1.429688,2.46093749 2.320313,2.46093749 2.050781,0 2.601562,-1.26562499 2.601562,-3.04687499 v -1.4882813 c 0,-0.75 0.02344,-1.4648437 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.222657,0.035156 -0.351563,0.035156 -0.117187,0 -0.222656,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="NVIDIAGPU"
+       transform="matrix(1.3333333,0,0,1.3333333,371.81333,186.24)" />
+    <g
+       id="g144"
+       clip-path="url(#clipPath145)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,257.56997)"
+         style="fill:url(#linearGradient144);stroke:none"
+         id="path144" />
+    </g>
+    <path
+       id="path146"
+       d="M 272.12953,-17.00824 V 5.66927 h 62.36289 v -22.67751 z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path147"
+       d="m 4.0545313,-2.7407734 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.2929688,0 -0.8320313,-0.011719 -1.0546875,-0.035156 l 1.0546875,-2.4375 h 0.023437 c 0.4101562,0.9375 0.7382812,1.7578125 1.007813,2.449219 z m -2.2734375,0.46875 c 0.2460937,-0.023437 0.9375,-0.023437 1.2890625,-0.023437 0.375,0 0.9257812,0 1.1601562,0.023437 0.4921875,1.31249996 0.7148438,2.08593746 0.7851563,2.30859371 0.1640625,-0.03515625 0.328125,-0.03515625 0.4921875,-0.03515625 0.1757812,0 0.4335937,0 0.5976562,0.03515625 C 5.613125,-0.97124219 4.2889063,-4.3696797 3.3748438,-6.5493672 h -0.28125 C 2.144375,-4.3462422 1.16,-2.1665547 0.16390625,0.03657031 0.28109375,0.00141406 0.39828125,0.00141406 0.50375,0.00141406 c 0.1171875,0 0.328125,0 0.45703125,0.03515625 C 1.113125,-0.57280469 1.4295313,-1.3931172 1.7810938,-2.2720234 Z m 6.4072265,-4.3242188 c -0.3515625,2.015625 -0.890625,4.7695313 -1.2890625,6.63281251 0.1171875,-0.03515625 0.2109375,-0.03515625 0.328125,-0.03515625 0.1171875,0 0.1992188,0 0.3046875,0.03515625 C 7.6844141,-1.0181172 8.0242578,-3.2329609 8.2586328,-4.7446797 h 0.046875 c 0.75,1.59375 1.453125,3.1992188 2.0976562,4.74609376 h 0.1875 c 0.714844,-1.62890626 1.417969,-3.15234376 2.214844,-4.71093746 l 0.02344,0.011719 c 0.210938,1.5585938 0.410157,3.0820313 0.5625,4.73437501 0.152344,-0.03515625 0.339844,-0.03515625 0.480469,-0.03515625 0.152344,0 0.410156,0 0.5625,0.03515625 -0.363281,-2.13281251 -0.65625,-4.20703121 -0.9375,-6.63281251 H 13.25082 l -2.425781,4.9453125 h -0.07031 C 9.9578516,-3.2915547 9.2430078,-4.8853047 8.5164453,-6.5962422 Z m 8.8037107,5.77734376 V -5.6118672 c 0,-0.3515625 0.339844,-0.3984375 0.832032,-0.3984375 2.15625,0 2.800781,1.6523438 2.800781,3.1875 0,2.00390626 -1.148438,2.40234376 -2.554688,2.40234376 -0.972656,0 -1.078125,-0.0820313 -1.078125,-0.3984375 z M 16.558438,-6.4204609 c -0.339844,0 -0.503907,-0.035156 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.99218746 l 0.01172,0.03515625 c 0,0 0.164062,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257812,-0.64453125 3.257812,-3.09374996 0,-1.8515625 -1.535156,-3.3867188 -3.445312,-3.3867188 -0.632813,0 -1.007813,0.035156 -1.640625,0.035156 z m 13.535156,5.2617187 v 0.44531251 c -0.339844,0.31640625 -0.925781,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546871 -2.425781,-2.91796871 0,-1.7460938 0.960937,-2.859375 2.308594,-2.859375 0.890625,0 1.664062,0.5039062 2.109375,1.2539062 l 0.117187,-0.011719 c 0.03516,-0.5039063 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054687,-0.4921875 -2.273437,-0.4921875 -1.710938,0 -3.480469,1.2070313 -3.480469,3.3984375 0,1.7226563 1.230469,3.25781251 3.234375,3.25781251 1.148437,0 1.96875,-0.33984375 2.648437,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105468,-0.36328116 -0.105468,-0.46874996 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105468,-1.6992189 l -0.01172,-0.035156 c 0,0 -0.175782,0.035156 -0.515625,0.035156 -0.339844,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484376 0.09375,1.8984376 z m 3.606445,-4.4648437 c 0,-0.2929688 0.175781,-0.4453125 0.890625,-0.4453125 0.691406,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367187 -0.5625,1.5117187 -1.523438,1.5117187 -0.257812,0 -0.65625,-0.035156 -0.785156,-0.1054687 z m -0.867187,1.1953125 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.99218746 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503906,-0.03515625 0.117188,0 0.386719,0.01171875 0.503907,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218746 -0.09375,-1.99218746 v -0.8554688 c 0.210938,0.070312 0.480469,0.09375 0.832031,0.09375 1.816407,0 2.4375,-1.1367187 2.4375,-1.9921875 0,-0.7382812 -0.46875,-1.7460937 -2.308593,-1.7460937 -0.257813,0 -1.101563,0.070312 -1.394532,0.070312 -0.117187,0 -0.386718,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 10.438476,0 v 1.6757812 c 0,1.1132813 -0.164062,2.29687501 -1.851562,2.29687501 -1.664063,0 -1.664063,-1.68750001 -1.664063,-2.22656251 v -1.7460937 c 0,-0.75 0.01172,-1.4648438 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503907,0.035156 -0.117187,0 -0.386718,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742187 c 0,2.03906251 1.429687,2.46093751 2.320312,2.46093751 2.050781,0 2.601563,-1.26562501 2.601563,-3.04687501 v -1.4882812 c 0,-0.75 0.02344,-1.4648438 0.105468,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.222656,0.035156 -0.351563,0.035156 -0.117187,0 -0.222656,-0.011719 -0.339843,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="AMDGPU"
+       transform="matrix(1.3333333,0,0,1.3333333,378.95333,231.69733)" />
+    <g
+       id="g151"
+       clip-path="url(#clipPath152)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,302.92473)"
+         style="fill:url(#linearGradient151);stroke:none"
+         id="path151" />
+    </g>
+    <path
+       id="path153"
+       d="m 272.12953,-51.02432 v 22.67752 h 62.36289 v -22.67752 z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path154"
+       d="m 1.0306094,-4.4295391 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 0.96029687,0.03530469 C 1.0774844,0.01186719 1.3470156,1.484375e-4 1.4642031,1.484375e-4 c 0.1171875,0 0.3867188,0.0117187525 0.5039063,0.0351562525 L 1.9915469,1.484375e-4 C 1.9095156,-0.56235156 1.8977969,-1.2420391 1.8977969,-1.9920391 v -2.4375 c 0,-0.75 0.011719,-1.4648437 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.50390623,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.0937495,1.9921875 z m 6.4804687,2.6367188 c 0,-0.3164063 0.023437,-0.6914063 0.023437,-1.0078125 0,-1.03125 -0.3398437,-1.5703125 -1.2421875,-1.5703125 -0.375,0 -1.078125,0.1523437 -1.7226562,0.890625 l -0.011719,-0.023437 v -0.7265625 c -0.011719,-0.070312 -0.023437,-0.1054688 -0.070312,-0.1054688 -0.2226562,0.046875 -0.5859375,0.058594 -0.796875,0.035156 l -0.023437,0.023437 c 0.070312,0.4453125 0.09375,1.1835937 0.09375,1.9335937 v 0.5507813 c 0,0.75 -0.011719,1.25390624 -0.09375,1.7929687375 L 3.6907656,0.03530469 c 0.1171875,-0.0234375 0.3398438,-0.0351562525 0.46875,-0.0351562525 0.1171875,0 0.3398438,0.0117187525 0.46875,0.0351562525 L 4.6399844,1.484375e-4 C 4.5579531,-0.56235156 4.5579531,-1.0311016 4.5579531,-1.7928203 v -1.1601563 c 0.5039063,-0.5976562 1.078125,-0.8203125 1.4414063,-0.8203125 0.5039062,0 0.7148437,0.1992188 0.7148437,1.0078125 v 0.9726563 c 0,0.75 -0.023437,1.26562499 -0.09375,1.7929687375 L 6.6438906,0.03530469 c 0.1171875,-0.0234375 0.3515625,-0.0351562525 0.46875,-0.0351562525 0.1171875,0 0.3515625,0.0117187525 0.46875,0.0351562525 L 7.6048281,1.484375e-4 C 7.5227969,-0.56235156 7.5110781,-1.0311016 7.5110781,-1.7928203 Z m 1.7373047,-2.484375 c -0.2109375,0 -0.421875,0 -0.5390625,-0.011719 -0.070312,0.1640625 -0.1289062,0.2578125 -0.2226562,0.3984375 l 0.046875,0.070312 c 0.1757812,0 0.4804687,0 0.7148437,-0.011719 v 1.359375 c 0,0.5976562 -0.035156,1.3125 -0.035156,1.61718749 0,0.64453125 0.421875,0.9609375 0.8789062,0.9609375 0.421875,0 0.738281,-0.1054687525 1.160156,-0.375 l -0.128906,-0.234375 c -0.304688,0.09375 -0.527344,0.10546875 -0.785156,0.0703125 -0.234375,-0.0351563 -0.328125,-0.26953125 -0.328125,-0.82031249 0,-0.3046875 0.03516,-0.7265625 0.03516,-1.3242188 v -1.2539062 h 0.375 c 0.234375,0 0.574219,0.011719 0.714844,0.011719 0.03516,-0.1523437 0.05859,-0.2578125 0.117187,-0.3984375 l -0.04687,-0.070312 c -0.175781,0 -0.46875,0.011719 -0.691406,0.011719 h -0.46875 c 0,-0.7382813 0,-0.8671875 0.04687,-1.453125 0,-0.070312 -0.03516,-0.09375 -0.09375,-0.09375 -0.2460937,0.09375 -0.375,0.2109375 -0.6914062,0.2460937 l -0.023437,0.035156 c -0.023437,0.3398437 -0.035156,0.6914062 -0.035156,1.265625 z m 3.6796872,1.5585937 c 0.175782,-1.03125 0.820313,-1.2539062 1.136719,-1.2539062 0.386719,0 0.84375,0.3632812 0.84375,1.1132812 0,0.09375 -0.04687,0.140625 -0.140625,0.140625 z m 2.660157,1.6875 c -0.351563,0.37500004 -0.808594,0.53906254 -1.359375,0.53906254 -0.351563,0 -0.820313,-0.12890625 -1.089844,-0.56250004 -0.175781,-0.28125 -0.234375,-0.6796875 -0.234375,-1.265625 h 2.707031 c 0.105469,0 0.175781,-0.058594 0.175781,-0.1757812 0,-0.8320313 -0.410156,-1.875 -1.722656,-1.875 -1.019531,0 -2.039062,0.8203125 -2.039062,2.2851562 0,0.5742188 0.105468,1.12500004 0.445312,1.52343754 0.339844,0.421875 0.914063,0.66796875 1.582031,0.66796875 0.726563,0 1.359375,-0.375 1.734375,-0.890625 z m 1.514648,-0.9609375 c 0,0.75 -0.02344,1.46484379 -0.09375,1.9921875375 l 0.02344,0.0351562525 c 0.117188,-0.0234375 0.339844,-0.0351562525 0.46875,-0.0351562525 0.117188,0 0.339844,0.0117187525 0.457032,0.0351562525 L 17.981781,1.484375e-4 C 17.89975,-0.56235156 17.89975,-1.2303203 17.89975,-1.9920391 v -2.9882812 c 0,-0.75 0.03516,-1.2304688 0.08203,-1.875 0,-0.070312 -0.02344,-0.09375 -0.08203,-0.09375 -0.257813,0.09375 -0.46875,0.1640625 -0.867188,0.1992187 l -0.02344,0.023437 c 0.07031,0.4335938 0.09375,1.171875 0.09375,1.921875 z m 9.536133,0.8320313 v 0.44531249 c -0.339844,0.31640625 -0.925781,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546879 -2.425781,-2.91796879 0,-1.7460937 0.960937,-2.859375 2.308594,-2.859375 0.890625,0 1.664062,0.5039063 2.109375,1.2539063 l 0.117187,-0.011719 c 0.03516,-0.5039062 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054687,-0.4921875 -2.273437,-0.4921875 -1.710938,0 -3.480469,1.2070312 -3.480469,3.3984375 0,1.7226562 1.230469,3.25781249 3.234375,3.25781249 1.148437,0 1.96875,-0.33984375 2.648437,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105468,-0.36328134 -0.105468,-0.46875004 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105468,-1.6992187 l -0.01172,-0.035156 c 0,0 -0.175782,0.035156 -0.515625,0.035156 -0.339844,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484375 0.09375,1.8984375 z m 3.603515,-4.4648438 c 0,-0.2929687 0.175782,-0.4453125 0.890625,-0.4453125 0.691407,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.523437,1.5117188 -0.257813,0 -0.65625,-0.035156 -0.785157,-0.1054688 z m -0.867187,1.1953125 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.9921875375 l 0.02344,0.0351562525 c 0.117188,-0.0234375 0.386719,-0.0351562525 0.503907,-0.0351562525 0.117187,0 0.386718,0.0117187525 0.503906,0.0351562525 L 30.336273,1.484375e-4 C 30.254242,-0.56235156 30.242523,-1.2420391 30.242523,-1.9920391 v -0.8554687 c 0.210938,0.070312 0.480469,0.09375 0.832032,0.09375 1.816406,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.308594,-1.7460938 -0.257813,0 -1.101563,0.070312 -1.394531,0.070312 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 10.438476,0 v 1.6757813 c 0,1.1132812 -0.164062,2.29687499 -1.851562,2.29687499 -1.664063,0 -1.664063,-1.68749999 -1.664063,-2.22656249 v -1.7460938 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742188 c 0,2.03906249 1.429687,2.46093749 2.320312,2.46093749 2.050782,0 2.601563,-1.26562499 2.601563,-3.04687499 v -1.4882813 c 0,-0.75 0.02344,-1.4648437 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.222656,0.035156 -0.351562,0.035156 -0.117188,0 -0.222657,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="IntelGPU"
+       transform="matrix(1.3333333,0,0,1.3333333,381.25867,277.26933)" />
+  </g>
 </svg>
diff --git a/doc/img/libCEEDBackends.tex b/doc/img/libCEEDBackends.tex
new file mode 100644
index 0000000000..a1b9d28652
--- /dev/null
+++ b/doc/img/libCEEDBackends.tex
@@ -0,0 +1,192 @@
+\documentclass[tikz]{standalone}
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usepackage{pgfmath}
+\usepackage{libertine}
+\usetikzlibrary{calc}
+
+\renewcommand{\familydefault}{\sfdefault}
+
+\definecolor{ceed@blue}{RGB}{100,150,230}
+\definecolor{ceed@green}{RGB}{75,200,75}
+\definecolor{ceed@red}{RGB}{200,75,75}
+\definecolor{ceed@orange}{RGB}{252,186,3}
+
+\pgfplotsset{compat=1.18}
+
+\begin{document}
+
+\begin{tikzpicture}
+
+\begin{scope}[shift={(0,-0.6)}]
+  \node at (1.0,6.1) {\large Application};
+
+  % PETSc
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,3.0) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {PETSc};
+  \draw[-stealth, line width=0.5pt] (1.6, 3.0+0.4) -- ++(1.6,-1.2-0.4);
+
+  % Ratel
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,1.8) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {Ratel};
+  \draw[-stealth, line width=0.5pt] (1.6, 1.8+0.4) -- ++(1.6,0.0-0.55);
+
+  % HONEE
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,0.6) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {HONEE};
+  \draw[-stealth, line width=0.5pt] (1.6, 0.6+0.4) -- ++(1.6,1.2-0.65);
+
+  % MFEM
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,-0.6) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {MFEM};
+  \draw[-stealth, line width=0.5pt] (1.6, -0.6+0.4) -- ++(1.6,2.4-0.8);
+\end{scope}
+
+\begin{scope}[shift={(3.2,0)}]
+  \begin{scope}[shift={(0,-0.6)}]
+  \node at (0.8,6.1) {\large Library};
+    \draw[
+      top color=ceed@blue!10!white,
+      bottom color=ceed@blue!40!white,
+      ceed@blue!60!black,
+    ] (0.0,1.2) rectangle ++(1.6,0.8)
+    node[pos=.5,align=center,color=black] {libCEED};
+
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,3.6);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,2.4);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,1.2);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,0.0);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-1.2);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-2.4);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-3.6);
+  \end{scope}
+\end{scope}
+
+\begin{scope}[shift={(6.4,0)}]
+  \begin{scope}[shift={(0,-0.6)}]
+    \node at (0.95,6.1) {\large Backends};
+
+    % C
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,4.8) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {Pure C};
+    \draw[-stealth, line width=0.5pt] (2.0, 5.2) -- ++(1.2,-1.2+0.15);
+
+    % AVX
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,3.6) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {AVX};
+    \draw[-stealth, line width=0.5pt] (2.0, 4.0) -- ++(1.2,+0.0+0.025);
+
+    % LIBXSMM
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,2.4) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {LIBXSMM};
+    \draw[-stealth, line width=0.5pt] (2.0, 2.8) -- ++(1.2,1.1-0.025);
+
+    % CUDA
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,1.2) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {CUDA};
+    \draw[-stealth, line width=0.5pt] (2.0, 1.6) -- ++(1.2,0.0+0.025);
+
+    % HIP
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,0.0) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {HIP};
+    \draw[-stealth, line width=0.5pt] (2.0, 0.4) -- ++(1.2,0.0+0.025);
+
+    % SYCL
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,-1.2) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {SYCL};
+    \draw[-stealth, line width=0.5pt] (2.0, -0.8) -- ++(1.2,0.0+0.025);
+
+    % MAGMA
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,-2.4) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {MAGMA};
+    \draw[-stealth, line width=0.5pt] (2.0, -2.0) -- ++(1.2,3.7-0.15);
+    \draw[-stealth, line width=0.5pt] (2.0, -2.0) -- ++(1.2,2.5-0.15);
+
+  \end{scope}
+\end{scope}
+
+\begin{scope}[shift={(9.6,0)}]
+  \begin{scope}[shift={(0,-0.6)}]
+    \node at (1.1,6.1) {\large Hardware};
+
+    % CPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,3.6) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {CPU};
+
+    % CUDA GPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,1.2) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {NVIDIA GPU};
+
+    % ROCm GPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,-0.0) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {AMD GPU};
+
+    % Intel GPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,-1.2) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {Intel GPU};
+
+  \end{scope}
+\end{scope}
+
+\end{tikzpicture}
+\end{document}

From ed094490f53e580908aa80e9fe815a6fd76d7526 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 24 Jul 2025 13:44:56 -0600
Subject: [PATCH 478/571] api - naming consistency for composite CeedOperator

---
 backends/cuda-gen/ceed-cuda-gen-operator.c    | 12 +--
 backends/cuda-ref/ceed-cuda-ref-operator.c    |  8 +-
 backends/hip-gen/ceed-hip-gen-operator.c      | 14 +--
 backends/hip-ref/ceed-hip-ref-operator.c      |  8 +-
 backends/ref/ceed-ref-operator.c              |  4 +-
 .../sycl-ref/ceed-sycl-ref-operator.sycl.cpp  |  8 +-
 doc/sphinx/source/releasenotes.md             |  2 +
 examples/fluids/problems/advection.c          |  2 +-
 examples/fluids/problems/newtonian.c          |  2 +-
 examples/fluids/src/differential_filter.c     |  4 +-
 examples/fluids/src/mat-ceed.c                |  4 +-
 examples/fluids/src/setuplibceed.c            | 20 ++--
 .../fluids/src/strong_boundary_conditions.c   |  4 +-
 include/ceed/backend.h                        |  4 +-
 include/ceed/ceed.h                           | 20 ++--
 interface/ceed-fortran.c                      | 12 +--
 interface/ceed-operator.c                     | 64 ++++++-------
 interface/ceed-preconditioning.c              | 96 ++++++++++---------
 julia/LibCEED.jl/src/Operator.jl              |  4 +-
 .../src/generated/libceed_bindings.jl         | 20 ++--
 julia/LibCEED.jl/test/rundevtests.jl          | 34 ++++++-
 julia/LibCEED.jl/test/runtests.jl             |  4 -
 python/ceed_operator.py                       |  4 +-
 rust/libceed/src/operator.rs                  |  4 +-
 tests/t520-operator-f.f90                     | 12 +--
 tests/t520-operator.c                         | 22 ++---
 tests/t521-operator-f.f90                     | 12 +--
 tests/t521-operator.c                         | 12 +--
 tests/t522-operator-f.f90                     | 12 +--
 tests/t522-operator.c                         | 12 +--
 tests/t523-operator-f.f90                     | 12 +--
 tests/t523-operator.c                         | 12 +--
 tests/t524-operator-f.f90                     | 12 +--
 tests/t524-operator.c                         | 12 +--
 tests/t525-operator.c                         |  6 +-
 tests/t526-operator.c                         |  6 +-
 tests/t538-operator.c                         |  6 +-
 tests/t554-operator.c                         | 20 ++--
 tests/t565-operator.c                         |  6 +-
 39 files changed, 287 insertions(+), 245 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 7965b2a70f..c6791807ac 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -302,8 +302,8 @@ static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector in
   CeedOperator     *sub_operators;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+  CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
   for (CeedInt i = 0; i < num_suboperators; i++) {
@@ -515,7 +515,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, boo
 
     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-    CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
+    CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
     return CEED_ERROR_SUCCESS;
   }
   return CEED_ERROR_SUCCESS;
@@ -695,7 +695,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator o
 //------------------------------------------------------------------------------
 // AtPoints full assembly
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
+static int CeedOperatorAssembleSingleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
   Ceed                   ceed;
   CeedOperator_Cuda_gen *data;
 
@@ -851,7 +851,7 @@ static int CeedSingleOperatorAssembleAtPoints_Cuda_gen(CeedOperator op, CeedInt
 
     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-    CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled));
+    CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled));
     return CEED_ERROR_SUCCESS;
   }
   return CEED_ERROR_SUCCESS;
@@ -878,7 +878,7 @@ int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
   if (is_at_points) {
     CeedCallBackend(
         CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen));
-    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Cuda_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Cuda_gen));
   }
   if (!is_at_points) {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda_gen));
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index de6129fc25..b278f37643 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1501,7 +1501,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
 //------------------------------------------------------------------------------
 // Single Operator Assembly Setup
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
+static int CeedOperatorAssembleSingleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
   Ceed_Cuda          *cuda_data;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
@@ -1707,7 +1707,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
 // (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) {
+static int CeedOperatorAssembleSingle_Cuda(CeedOperator op, CeedInt offset, CeedVector values) {
   Ceed                ceed;
   CeedSize            values_length = 0, assembled_qf_length = 0;
   CeedInt             use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out;
@@ -1733,7 +1733,7 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
 
   // Setup
-  if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx));
+  if (!impl->asmb) CeedCallBackend(CeedOperatorAssembleSingleSetup_Cuda(op, use_ceedsize_idx));
   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
 
   assert(asmb != NULL);
@@ -2077,7 +2077,7 @@ int CeedOperatorCreate_Cuda(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda));
   CeedCallBackend(
       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index bf673730db..e4c2634a66 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -30,7 +30,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
   if (is_composite) {
     CeedInt num_suboperators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i]));
       impl->streams[i] = NULL;
@@ -262,8 +262,8 @@ static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector inp
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedCallBackend(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-  CeedCallBackend(CeedCompositeOperatorGetSubList(op, &sub_operators));
+  CeedCallBackend(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+  CeedCallBackend(CeedOperatorCompositeGetSubList(op, &sub_operators));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
   for (CeedInt i = 0; i < num_suboperators; i++) {
@@ -491,7 +491,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool
 
     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LineearAssembleQFunction\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-    CeedCallBackend(CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
+    CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
     return CEED_ERROR_SUCCESS;
   }
   return CEED_ERROR_SUCCESS;
@@ -684,7 +684,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op
 //------------------------------------------------------------------------------
 // AtPoints full assembly
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
+static int CeedOperatorAssembleSingleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
   Ceed                  ceed;
   CeedOperator_Hip_gen *data;
 
@@ -860,7 +860,7 @@ static int CeedSingleOperatorAssembleAtPoints_Hip_gen(CeedOperator op, CeedInt o
 
     CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-    CeedCallBackend(CeedSingleOperatorAssemble(op_fallback, offset, assembled));
+    CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled));
     return CEED_ERROR_SUCCESS;
   }
   return CEED_ERROR_SUCCESS;
@@ -886,7 +886,7 @@ int CeedOperatorCreate_Hip_gen(CeedOperator op) {
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   if (is_at_points) {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
-    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Hip_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Hip_gen));
   }
   if (!is_at_points) {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip_gen));
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 15fc21b73a..8c37aba61b 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1498,7 +1498,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op,
 //------------------------------------------------------------------------------
 // Single Operator Assembly Setup
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
+static int CeedOperatorAssembleSingleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
   Ceed_Hip           *hip_data;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
@@ -1704,7 +1704,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
 // (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) {
+static int CeedOperatorAssembleSingle_Hip(CeedOperator op, CeedInt offset, CeedVector values) {
   Ceed                ceed;
   CeedSize            values_length = 0, assembled_qf_length = 0;
   CeedInt             use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out;
@@ -1730,7 +1730,7 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
 
   // Setup
-  if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx));
+  if (!impl->asmb) CeedCallBackend(CeedOperatorAssembleSingleSetup_Hip(op, use_ceedsize_idx));
   CeedOperatorAssemble_Hip *asmb = impl->asmb;
 
   assert(asmb != NULL);
@@ -2076,7 +2076,7 @@ int CeedOperatorCreate_Hip(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip));
   CeedCallBackend(
       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 89fd47df76..4769d34d68 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1527,7 +1527,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
 //------------------------------------------------------------------------------
 // Assemble Operator AtPoints
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleAtPoints_Ref(CeedOperator op, CeedInt offset, CeedVector values) {
+static int CeedOperatorAssembleSingleAtPoints_Ref(CeedOperator op, CeedInt offset, CeedVector values) {
   CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, num_comp_active = 1;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {0}, *assembled;
   Ceed                ceed;
@@ -1798,7 +1798,7 @@ int CeedOperatorCreateAtPoints_Ref(CeedOperator op) {
   CeedCallBackend(
       CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssembleAtPoints_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 1bf91636db..9833e2a837 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -997,7 +997,7 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op,
 //------------------------------------------------------------------------------
 // Single operator assembly setup
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
+static int CeedOperatorAssembleSingleSetup_Sycl(CeedOperator op) {
   Ceed    ceed;
   CeedInt num_input_fields, num_output_fields, num_eval_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_eval_mode_out = 0,
                                                num_B_out_mats_to_load = 0, size_B_out = 0, num_qpts = 0, elem_size = 0, num_elem, num_comp,
@@ -1337,7 +1337,7 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons
 // input restriction/basis per operator (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, CeedVector values) {
+static int CeedOperatorAssembleSingle_Sycl(CeedOperator op, CeedInt offset, CeedVector values) {
   Ceed                ceed;
   Ceed_Sycl          *sycl_data;
   CeedScalar         *values_array;
@@ -1353,7 +1353,7 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed
 
   // Setup
   if (!impl->asmb) {
-    CeedCallBackend(CeedSingleOperatorAssembleSetup_Sycl(op));
+    CeedCallBackend(CeedOperatorAssembleSingleSetup_Sycl(op));
     assert(impl->asmb != NULL);
   }
 
@@ -1397,7 +1397,7 @@ int CeedOperatorCreate_Sycl(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Sycl));
   CeedCallBackend(
       CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl));
-  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Sycl));
+  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 2e329e54d7..0a4be0b959 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -15,6 +15,8 @@ On this page we provide a summary of the main API changes, new features and exam
     - Add `CEED_RUNNING_JIT_PASS` compiler definition for wrapping header files that device JiT compilers cannot read
     - Users should now prefer `#include <ceed/types.h>` rather than `#include <ceed.h>` in QFunction source files
 - Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`.
+- Rename `CeedCompositeOperatorCreate()` to `CeedOperatorCreateComposite()` for uniformity.
+- Rename `CeedCompositeOperator*()` to `CeedOperatorComposite*()` for uniformity.
 
 ### New features
 
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 09d444dd01..7047fd6b66 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -35,7 +35,7 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator
     CeedOperatorField field;
     PetscInt          sub_op_index = 0;  // will be 0 for the volume op
 
-    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
     PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index f02cc949e8..e13df731c2 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -171,7 +171,7 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator
     CeedOperatorField field;
     PetscInt          sub_op_index = 0;  // will be 0 for the volume op
 
-    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
     PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index 8d0b1e6949..0727a39b75 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -96,7 +96,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
     // -- Get Grid anisotropy tensor
     PetscCall(GridAnisotropyTensorCalculateCollocatedVector(ceed, user, ceed_data, &elem_restr_grid_aniso, &grid_aniso_ceed, &num_comp_grid_aniso));
 
-    PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_lhs));
+    PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_lhs));
     for (PetscInt i = 0; i < diff_filter->num_filtered_fields; i++) {
       CeedQFunction       qf_lhs;
       PetscInt            num_comp_filter = diff_filter->num_field_components[i];
@@ -149,7 +149,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
       PetscCallCeed(ceed, CeedOperatorSetField(op_lhs_sub, "v", elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE));
       PetscCallCeed(ceed, CeedOperatorSetField(op_lhs_sub, "Grad_v", elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE));
 
-      PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_lhs, op_lhs_sub));
+      PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_lhs, op_lhs_sub));
       PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_filter));
       PetscCallCeed(ceed, CeedBasisDestroy(&basis_filter));
       PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_lhs));
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index bf2f52b006..5e8ebef86f 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -406,8 +406,8 @@ PetscErrorCode MatCreateCeed(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
           CeedInt       num_sub_operators;
           CeedOperator *sub_operators;
 
-          PetscCallCeed(ctx->ceed, CeedCompositeOperatorGetNumSub(op_mult, &num_sub_operators));
-          PetscCallCeed(ctx->ceed, CeedCompositeOperatorGetSubList(op_mult, &sub_operators));
+          PetscCallCeed(ctx->ceed, CeedOperatorCompositeGetNumSub(op_mult, &num_sub_operators));
+          PetscCallCeed(ctx->ceed, CeedOperatorCompositeGetSubList(op_mult, &sub_operators));
           for (CeedInt i = 0; i < num_sub_operators; i++) {
             CeedInt                  num_bases, num_comp;
             CeedBasis               *active_bases;
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 7c172a3abc..026b496878 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -28,7 +28,7 @@ static PetscErrorCode CreateKSPMassOperator_Unstabilized(User user, CeedOperator
     CeedOperatorField field;
     PetscInt          sub_op_index = 0;  // will be 0 for the volume op
 
-    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
     PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
@@ -134,8 +134,8 @@ static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DML
   }
 
   // Apply Sub-Operator for Physics
-  PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_apply, op_apply_bc));
-  if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_apply_ijacobian, op_apply_bc_jacobian));
+  PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_apply, op_apply_bc));
+  if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_apply_ijacobian, op_apply_bc_jacobian));
 
   PetscCallCeed(ceed, CeedVectorDestroy(&q_data_sur));
   PetscCallCeed(ceed, CeedVectorDestroy(&jac_data_sur));
@@ -197,7 +197,7 @@ static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc
     PetscInt            sub_op_index = 0;  // will be 0 for the volume op
     CeedElemRestriction elem_restr_q, elem_restr_x;
 
-    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(op_apply, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(op_apply, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
     PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
     PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q));
@@ -440,8 +440,8 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
   if (!user->phys->implicit) {  // RHS
     CeedOperator op_rhs;
 
-    PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_rhs));
-    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_rhs, op_rhs_vol));
+    PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_rhs));
+    PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_rhs, op_rhs_vol));
     PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, op_rhs, NULL));
 
     PetscCall(OperatorApplyContextCreate(dm, dm, ceed, op_rhs, user->q_ceed, user->g_ceed, user->Q_loc, NULL, &user->op_rhs_ctx));
@@ -456,11 +456,11 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     CeedOperator op_ijacobian = NULL;
 
     // Create Composite Operaters
-    PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &user->op_ifunction));
-    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(user->op_ifunction, op_ifunction_vol));
+    PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &user->op_ifunction));
+    PetscCallCeed(ceed, CeedOperatorCompositeAddSub(user->op_ifunction, op_ifunction_vol));
     if (op_ijacobian_vol) {
-      PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_ijacobian));
-      PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_ijacobian, op_ijacobian_vol));
+      PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_ijacobian));
+      PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_ijacobian, op_ijacobian_vol));
     }
     PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, user->op_ifunction, op_ijacobian));
 
diff --git a/examples/fluids/src/strong_boundary_conditions.c b/examples/fluids/src/strong_boundary_conditions.c
index 2e52a2ae8f..69a8558f78 100644
--- a/examples/fluids/src/strong_boundary_conditions.c
+++ b/examples/fluids/src/strong_boundary_conditions.c
@@ -104,7 +104,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
     PetscCallCeed(ceed, CeedOperatorSetField(op_strong_bc_sub, "q", elem_restr_q_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
 
     // -- Add to composite operator
-    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_strong_bc, op_strong_bc_sub));
+    PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_strong_bc, op_strong_bc_sub));
 
     PetscCallCeed(ceed, CeedVectorDestroy(&multiplicity));
     PetscCallCeed(ceed, CeedVectorDestroy(&x_stored));
@@ -168,7 +168,7 @@ PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User use
     PetscCall(DMRestoreGlobalVector(dm, &global_vec));
   }
 
-  PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_strong_bc));
+  PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_strong_bc));
   {
     PetscBool use_strongstg = PETSC_FALSE;
     PetscCall(PetscOptionsGetBool(NULL, NULL, "-stg_strong", &use_strongstg, NULL));
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index c48e0d4666..fd861702bb 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -461,9 +461,9 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op);
 CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback);
 CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent);
 CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent);
-CEED_EXTERN int CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
+CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
                                                                          CeedRequest *request);
-CEED_INTERN int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values);
+CEED_INTERN int CeedOperatorAssembleSingle(CeedOperator op, CeedInt offset, CeedVector values);
 CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op);
 
 CEED_INTERN int CeedMatrixMatrixMultiply(Ceed ceed, const CeedScalar *mat_A, const CeedScalar *mat_B, CeedScalar *mat_C, CeedInt m, CeedInt n,
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index ce6284b910..8d791ad94e 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -409,7 +409,7 @@ CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx);
 
 CEED_EXTERN int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op);
 CEED_EXTERN int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op);
-CEED_EXTERN int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op);
+CEED_EXTERN int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op);
 CEED_EXTERN int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy);
 CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector vec);
 CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperatorField **input_fields, CeedInt *num_output_fields,
@@ -418,10 +418,10 @@ CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields
 CEED_EXTERN int  CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_points, CeedVector point_coords);
 CEED_EXTERN int  CeedOperatorAtPointsGetPoints(CeedOperator op, CeedElemRestriction *rstr_points, CeedVector *point_coords);
 CEED_EXTERN int  CeedOperatorIsAtPoints(CeedOperator op, bool *is_at_points);
-CEED_EXTERN int  CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op);
-CEED_EXTERN int  CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators);
-CEED_EXTERN int  CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators);
-CEED_EXTERN int  CeedCompositeOperatorGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op);
+CEED_EXTERN int  CeedOperatorCompositeAddSub(CeedOperator composite_op, CeedOperator sub_op);
+CEED_EXTERN int  CeedOperatorCompositeGetNumSub(CeedOperator op, CeedInt *num_suboperators);
+CEED_EXTERN int  CeedOperatorCompositeGetSubList(CeedOperator op, CeedOperator **sub_operators);
+CEED_EXTERN int  CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op);
 CEED_EXTERN int  CeedOperatorCheckReady(CeedOperator op);
 CEED_EXTERN int  CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
 CEED_EXTERN int  CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
@@ -436,7 +436,7 @@ CEED_EXTERN int  CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op
 CEED_EXTERN int  CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols);
 CEED_EXTERN int  CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols);
 CEED_EXTERN int  CeedOperatorLinearAssemble(CeedOperator op, CeedVector values);
-CEED_EXTERN int  CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult);
+CEED_EXTERN int  CeedOperatorCompositeGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult);
 CEED_EXTERN int  CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse,
                                                   CeedBasis basis_coarse, CeedOperator *op_coarse, CeedOperator *op_prolong,
                                                   CeedOperator *op_restrict);
@@ -473,6 +473,14 @@ CEED_EXTERN int  CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, Ceed
 CEED_EXTERN int  CeedOperatorAssemblyDataStrip(CeedOperator op);
 CEED_EXTERN int  CeedOperatorDestroy(CeedOperator *op);
 
+// Compatibility with previous composite CeedOperator naming
+#define CeedCompositeOperatorCreate(a, b) CeedOperatorCreateComposite(a, b)
+#define CeedCompositeOperatorAddSub(a, b) CeedOperatorCompositeAddSub(a, b)
+#define CeedCompositeOperatorGetNumSub(a, b) CeedOperatorCompositeGetNumSub(a, b)
+#define CeedCompositeOperatorGetSubList(a, b) CeedOperatorCompositeGetSubList(a, b)
+#define CeedCompositeOperatorGetSubByName(a, b) CeedOperatorCompositeGetSubByName(a, b, c)
+#define CeedCompositeOperatorGetMultiplicity(a, b, c, d) CeedOperatorCompositeGetMultiplicity(a, b, c, d)
+
 CEED_EXTERN int CeedOperatorGetFieldByName(CeedOperator op, const char *field_name, CeedOperatorField *op_field);
 CEED_EXTERN int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name);
 CEED_EXTERN int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr);
diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c
index 73f8c801b9..e62cb30360 100644
--- a/interface/ceed-fortran.c
+++ b/interface/ceed-fortran.c
@@ -951,8 +951,8 @@ CEED_EXTERN void fCeedOperatorCreate(int *ceed, int *qf, int *dqf, int *dqfT, in
   CeedOperator_n++;
 }
 
-#define fCeedCompositeOperatorCreate FORTRAN_NAME(ceedcompositeoperatorcreate, CEEDCOMPOSITEOPERATORCREATE)
-CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) {
+#define fCeedOperatorCreateComposite FORTRAN_NAME(ceedoperatorcreatecomposite, CEEDOPERATORCREATECOMPOSITE)
+CEED_EXTERN void fCeedOperatorCreateComposite(int *ceed, int *op, int *err) {
   if (CeedOperator_count == CeedOperator_count_max) {
     CeedOperator_count_max += CeedOperator_count_max / 2 + 1;
     CeedRealloc(CeedOperator_count_max, &CeedOperator_dict);
@@ -960,7 +960,7 @@ CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) {
 
   CeedOperator *op_ = &CeedOperator_dict[CeedOperator_count];
 
-  *err = CeedCompositeOperatorCreate(Ceed_dict[*ceed], op_);
+  *err = CeedOperatorCreateComposite(Ceed_dict[*ceed], op_);
   if (*err) return;
   *op = CeedOperator_count++;
   CeedOperator_n++;
@@ -1003,12 +1003,12 @@ CEED_EXTERN void fCeedOperatorSetField(int *op, const char *field_name, int *r,
   *err = CeedOperatorSetField(op_, field_name_c, r_, b_, v_);
 }
 
-#define fCeedCompositeOperatorAddSub FORTRAN_NAME(ceedcompositeoperatoraddsub, CEEDCOMPOSITEOPERATORADDSUB)
-CEED_EXTERN void fCeedCompositeOperatorAddSub(int *compositeop, int *subop, int *err) {
+#define fCeedOperatorCompositeAddSub FORTRAN_NAME(ceedoperatorcompositeaddsub, CEEDOPERATORCOMPOSITEADDSUB)
+CEED_EXTERN void fCeedOperatorCompositeAddSub(int *compositeop, int *subop, int *err) {
   CeedOperator compositeop_ = CeedOperator_dict[*compositeop];
   CeedOperator subop_       = CeedOperator_dict[*subop];
 
-  *err = CeedCompositeOperatorAddSub(compositeop_, subop_);
+  *err = CeedOperatorCompositeAddSub(compositeop_, subop_);
 }
 
 #define fCeedOperatorSetName FORTRAN_NAME(ceedoperatorsetname, CEEDOPERATORSETNAME)
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 039f8522db..3a0a0ddfe7 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -376,8 +376,8 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
     CeedInt       num_sub;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Composite operator modified after ContextFieldLabel created");
 
@@ -443,8 +443,8 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
     CeedInt       num_sub;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Composite operator modified after ContextFieldLabel created");
 
@@ -506,8 +506,8 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
     CeedInt       num_sub;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Composite operator modified after ContextFieldLabel created");
 
@@ -831,13 +831,13 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
 
   @ref User
  */
-int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op) {
+int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op) {
   if (!ceed->CompositeOperatorCreate) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
     if (delegate) {
-      CeedCall(CeedCompositeOperatorCreate(delegate, op));
+      CeedCall(CeedOperatorCreateComposite(delegate, op));
       CeedCall(CeedDestroy(&delegate));
       return CEED_ERROR_SUCCESS;
     }
@@ -1263,7 +1263,7 @@ int CeedOperatorFieldGetData(CeedOperatorField op_field, const char **field_name
 
   @ref User
  */
-int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op) {
+int CeedOperatorCompositeAddSub(CeedOperator composite_op, CeedOperator sub_op) {
   bool is_immutable;
 
   CeedCheck(composite_op->is_composite, CeedOperatorReturnCeed(composite_op), CEED_ERROR_MINOR, "CeedOperator is not a composite operator");
@@ -1303,7 +1303,7 @@ int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op)
 
   @ref Backend
 **/
-int CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) {
+int CeedOperatorCompositeGetNumSub(CeedOperator op, CeedInt *num_suboperators) {
   bool is_composite;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -1322,7 +1322,7 @@ int CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) {
 
   @ref Backend
 **/
-int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators) {
+int CeedOperatorCompositeGetSubList(CeedOperator op, CeedOperator **sub_operators) {
   bool is_composite;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -1346,7 +1346,7 @@ int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operator
 
   @ref Advanced
 **/
-int CeedCompositeOperatorGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op) {
+int CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op) {
   bool          is_composite;
   CeedInt       num_sub_ops;
   CeedOperator *sub_ops;
@@ -1354,8 +1354,8 @@ int CeedCompositeOperatorGetSubByName(CeedOperator op, const char *op_name, Ceed
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator");
   *sub_op = NULL;
-  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub_ops));
-  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_ops));
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub_ops));
+  CeedCall(CeedOperatorCompositeGetSubList(op, &sub_ops));
   for (CeedInt i = 0; i < num_sub_ops; i++) {
     if (sub_ops[i]->name && !strcmp(op_name, sub_ops[i]->name)) {
       *sub_op = sub_ops[i];
@@ -1386,7 +1386,7 @@ int CeedOperatorCheckReady(CeedOperator op) {
   if (is_composite) {
     CeedInt num_suboperators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
     if (!num_suboperators) {
       // Empty operator setup
       op->input_size  = 0;
@@ -1394,7 +1394,7 @@ int CeedOperatorCheckReady(CeedOperator op) {
     } else {
       CeedOperator *sub_operators;
 
-      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+      CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
       for (CeedInt i = 0; i < num_suboperators; i++) {
         CeedCall(CeedOperatorCheckReady(sub_operators[i]));
       }
@@ -1448,8 +1448,8 @@ int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, Ce
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       CeedSize sub_input_size, sub_output_size;
 
@@ -1516,8 +1516,8 @@ int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(sub_operators[i], needs_data_update));
     }
@@ -1599,8 +1599,8 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? name : "");
 
     for (CeedInt i = 0; i < num_suboperators; i++) {
@@ -1729,9 +1729,9 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
   if (is_composite) {
     CeedInt num_suboperators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
     CeedOperator *sub_operators;
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
 
     // FLOPs for each suboperator
     for (CeedInt i = 0; i < num_suboperators; i++) {
@@ -1906,8 +1906,8 @@ int CeedOperatorGetContextFieldLabel(CeedOperator op, const char *field_name, Ce
     CeedContextFieldLabel new_field_label;
 
     CeedCall(CeedCalloc(1, &new_field_label));
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     CeedCall(CeedCalloc(num_sub, &new_field_label->sub_labels));
     new_field_label->num_sub_labels = num_sub;
 
@@ -2209,8 +2209,8 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq
       CeedInt       num_suboperators;
       CeedOperator *sub_operators;
 
-      CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+      CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+      CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
       for (CeedInt i = 0; i < num_suboperators; i++) {
         CeedCall(CeedOperatorApplyAdd(sub_operators[i], in, out, request));
       }
@@ -2250,8 +2250,8 @@ int CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, CeedVector out, C
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
 
     // Zero all output vectors
     for (CeedInt i = 0; i < num_suboperators; i++) {
@@ -2308,8 +2308,8 @@ int CeedOperatorAssemblyDataStrip(CeedOperator op) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       CeedCall(CeedQFunctionAssemblyDataDestroy(&sub_operators[i]->qf_assembled));
       CeedCall(CeedOperatorAssemblyDataDestroy(&sub_operators[i]->op_assembled));
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index cffa77d2ae..bbc031d6ce 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -124,14 +124,14 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorCreate(ceed_fallback, &op_fallback));
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCreateComposite(ceed_fallback, &op_fallback));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       CeedOperator op_sub_fallback;
 
       CeedCall(CeedOperatorGetFallback(sub_operators[i], &op_sub_fallback));
-      CeedCall(CeedCompositeOperatorAddSub(op_fallback, op_sub_fallback));
+      CeedCall(CeedOperatorCompositeAddSub(op_fallback, op_sub_fallback));
     }
   } else {
     bool               is_at_points = false;
@@ -213,7 +213,7 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
 
   @ref Developer
 **/
-static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator op, CeedRequest *request, const bool is_point_block,
+static inline int CeedOperatorLinearAssembleAddDiagonalSingle_Mesh(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                                    CeedVector assembled) {
   bool is_composite;
 
@@ -398,13 +398,13 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator
 
   @ref Developer
 **/
-static inline int CeedSingleOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedRequest *request, const bool is_point_block,
+static inline int CeedOperatorLinearAssembleAddDiagonalSingle(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                               CeedVector assembled) {
   bool is_at_points;
 
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCheck(!is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "AtPoints operator not supported");
-  CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(op, request, is_point_block, assembled));
+  CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle_Mesh(op, request, is_point_block, assembled));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -420,13 +420,13 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal(CeedOperator op, C
 
   @ref Developer
 **/
-static inline int CeedCompositeOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedRequest *request, const bool is_point_block,
+static inline int CeedOperatorLinearAssembleAddDiagonalComposite(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                                  CeedVector assembled) {
   CeedInt       num_sub;
   CeedOperator *suboperators;
 
-  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-  CeedCall(CeedCompositeOperatorGetSubList(op, &suboperators));
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+  CeedCall(CeedOperatorCompositeGetSubList(op, &suboperators));
   for (CeedInt i = 0; i < num_sub; i++) {
     if (is_point_block) {
       CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(suboperators[i], assembled, request));
@@ -451,7 +451,7 @@ static inline int CeedCompositeOperatorLinearAssembleAddDiagonal(CeedOperator op
 
   @ref Developer
 **/
-static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, CeedInt *rows, CeedInt *cols) {
+static int CeedOperatorAssembleSymbolicSingle(CeedOperator op, CeedInt offset, CeedInt *rows, CeedInt *cols) {
   Ceed                ceed;
   bool                is_composite;
   CeedSize            num_nodes_in, num_nodes_out, local_num_entries, count = 0;
@@ -650,7 +650,7 @@ static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op
 
   @ref Developer
 **/
-int CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
+int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
                                                              CeedRequest *request) {
   return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, false, assembled, rstr, request);
 }
@@ -668,7 +668,7 @@ int CeedOperatorFallbackLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, Ce
 
   @ref Developer
 **/
-int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) {
+int CeedOperatorAssembleSingle(CeedOperator op, CeedInt offset, CeedVector values) {
   bool is_composite, is_at_points;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -690,10 +690,10 @@ int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector value
     // Operator fallback
     CeedOperator op_fallback;
 
-    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedSingleOperatorAssemble\n");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorAssembleSingle\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
-      CeedCall(CeedSingleOperatorAssemble(op_fallback, offset, values));
+      CeedCall(CeedOperatorAssembleSingle(op_fallback, offset, values));
       return CEED_ERROR_SUCCESS;
     }
   }
@@ -925,7 +925,7 @@ int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector value
 
   @ref Utility
 **/
-static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num_entries) {
+static int CeedOperatorAssemblyCountEntriesSingle(CeedOperator op, CeedSize *num_entries) {
   bool                is_composite;
   CeedInt             num_elem_in, elem_size_in, num_comp_in, num_elem_out, elem_size_out, num_comp_out;
   CeedElemRestriction rstr_in, rstr_out;
@@ -972,8 +972,9 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num
 
   @ref Developer
 **/
-static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse,
-                                            CeedBasis basis_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) {
+static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse,
+                                                       CeedBasis basis_coarse, CeedBasis basis_c_to_f, CeedOperator *op_coarse,
+                                                       CeedOperator *op_prolong, CeedOperator *op_restrict) {
   bool                is_composite;
   Ceed                ceed;
   CeedInt             num_comp, num_input_fields, num_output_fields;
@@ -2107,7 +2108,7 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce
   } else if (is_composite) {
     // Default to summing contributions of suboperators
     CeedCall(CeedVectorSetValue(assembled, 0.0));
-    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, false, assembled));
     return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
@@ -2167,7 +2168,7 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
     return CEED_ERROR_SUCCESS;
   } else if (is_composite) {
     // Default to summing contributions of suboperators
-    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, false, assembled));
     return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
@@ -2181,7 +2182,7 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
     }
   }
   // Default interface implementation
-  CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
+  CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle(op, request, false, assembled));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2217,8 +2218,8 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
   CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub_operators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub_operators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
   } else {
     sub_operators     = &op;
     num_sub_operators = 1;
@@ -2406,9 +2407,9 @@ int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector
   }
   // Default interface implementation
   if (is_composite) {
-    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, true, assembled));
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, true, assembled));
   } else {
-    CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, true, assembled));
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle(op, request, true, assembled));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -2463,14 +2464,14 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
   // Count entries and allocate rows, cols arrays
   *num_entries = 0;
   if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt k = 0; k < num_suboperators; ++k) {
-      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
+      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
       *num_entries += single_entries;
     }
   } else {
-    CeedCall(CeedSingleOperatorAssemblyCountEntries(op, &single_entries));
+    CeedCall(CeedOperatorAssemblyCountEntriesSingle(op, &single_entries));
     *num_entries += single_entries;
   }
   CeedCall(CeedCalloc(*num_entries, rows));
@@ -2478,15 +2479,15 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
 
   // Assemble nonzero locations
   if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt k = 0; k < num_suboperators; ++k) {
-      CeedCall(CeedSingleOperatorAssembleSymbolic(sub_operators[k], offset, *rows, *cols));
-      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
+      CeedCall(CeedOperatorAssembleSymbolicSingle(sub_operators[k], offset, *rows, *cols));
+      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
       offset += single_entries;
     }
   } else {
-    CeedCall(CeedSingleOperatorAssembleSymbolic(op, offset, *rows, *cols));
+    CeedCall(CeedOperatorAssembleSymbolicSingle(op, offset, *rows, *cols));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -2533,17 +2534,17 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
   } else if (is_composite) {
     // Default to summing contributions of suboperators
     CeedCall(CeedVectorSetValue(values, 0.0));
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt k = 0; k < num_suboperators; k++) {
-      CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values));
-      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
+      CeedCall(CeedOperatorAssembleSingle(sub_operators[k], offset, values));
+      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
       offset += single_entries;
     }
     return CEED_ERROR_SUCCESS;
   } else if (op->LinearAssembleSingle) {
     CeedCall(CeedVectorSetValue(values, 0.0));
-    CeedCall(CeedSingleOperatorAssemble(op, offset, values));
+    CeedCall(CeedOperatorAssembleSingle(op, offset, values));
     return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
@@ -2559,7 +2560,7 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
 
   // Default to interface version if non-composite and no fallback
   CeedCall(CeedVectorSetValue(values, 0.0));
-  CeedCall(CeedSingleOperatorAssemble(op, offset, values));
+  CeedCall(CeedOperatorAssembleSingle(op, offset, values));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2577,7 +2578,7 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
 
   @ref User
 **/
-int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult) {
+int CeedOperatorCompositeGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult) {
   Ceed                ceed;
   CeedInt             num_suboperators;
   CeedSize            l_vec_len;
@@ -2592,9 +2593,9 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic
   CeedCall(CeedVectorSetValue(mult, 0.0));
 
   // Get suboperators
-  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
   if (num_suboperators == 0) return CEED_ERROR_SUCCESS;
-  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+  CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
 
   // Work vector
   CeedCall(CeedVectorGetLength(mult, &l_vec_len));
@@ -2670,7 +2671,8 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin
   }
 
   // Core code
-  CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(
+      CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2735,7 +2737,8 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
   }
 
   // Core code
-  CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(
+      CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
   CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
@@ -2799,7 +2802,8 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f
   }
 
   // Core code
-  CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(
+      CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
   CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/julia/LibCEED.jl/src/Operator.jl b/julia/LibCEED.jl/src/Operator.jl
index d1de710c54..2ad8f41b1c 100644
--- a/julia/LibCEED.jl/src/Operator.jl
+++ b/julia/LibCEED.jl/src/Operator.jl
@@ -69,11 +69,11 @@ collection `ops`.
 """
 function create_composite_operator(c::Ceed, ops)
     ref = Ref{C.CeedOperator}()
-    C.CeedCompositeOperatorCreate(c[], ref)
+    C.CeedOperatorCreateComposite(c[], ref)
     comp_op = Operator(ref, QFunctionNone(), QFunctionNone(), QFunctionNone())
     comp_op.sub_ops = ops
     for op ∈ ops
-        C.CeedCompositeOperatorAddSub(comp_op[], op[])
+        C.CeedOperatorCompositeAddSub(comp_op[], op[])
     end
     comp_op
 end
diff --git a/julia/LibCEED.jl/src/generated/libceed_bindings.jl b/julia/LibCEED.jl/src/generated/libceed_bindings.jl
index 9cbf889dd9..d4bba38974 100644
--- a/julia/LibCEED.jl/src/generated/libceed_bindings.jl
+++ b/julia/LibCEED.jl/src/generated/libceed_bindings.jl
@@ -658,8 +658,8 @@ function CeedOperatorCreate(ceed, qf, dqf, dqfT, op)
     ccall((:CeedOperatorCreate, libceed), Cint, (Ceed, CeedQFunction, CeedQFunction, CeedQFunction, Ptr{CeedOperator}), ceed, qf, dqf, dqfT, op)
 end
 
-function CeedCompositeOperatorCreate(ceed, op)
-    ccall((:CeedCompositeOperatorCreate, libceed), Cint, (Ceed, Ptr{CeedOperator}), ceed, op)
+function CeedOperatorCreateComposite(ceed, op)
+    ccall((:CeedOperatorCreateComposite, libceed), Cint, (Ceed, Ptr{CeedOperator}), ceed, op)
 end
 
 function CeedOperatorReferenceCopy(op, op_copy)
@@ -674,16 +674,16 @@ function CeedOperatorGetFields(op, num_input_fields, input_fields, num_output_fi
     ccall((:CeedOperatorGetFields, libceed), Cint, (CeedOperator, Ptr{CeedInt}, Ptr{Ptr{CeedOperatorField}}, Ptr{CeedInt}, Ptr{Ptr{CeedOperatorField}}), op, num_input_fields, input_fields, num_output_fields, output_fields)
 end
 
-function CeedCompositeOperatorAddSub(composite_op, sub_op)
-    ccall((:CeedCompositeOperatorAddSub, libceed), Cint, (CeedOperator, CeedOperator), composite_op, sub_op)
+function CeedOperatorCompositeAddSub(composite_op, sub_op)
+    ccall((:CeedOperatorCompositeAddSub, libceed), Cint, (CeedOperator, CeedOperator), composite_op, sub_op)
 end
 
-function CeedCompositeOperatorGetNumSub(op, num_suboperators)
-    ccall((:CeedCompositeOperatorGetNumSub, libceed), Cint, (CeedOperator, Ptr{CeedInt}), op, num_suboperators)
+function CeedOperatorCompositeGetNumSub(op, num_suboperators)
+    ccall((:CeedOperatorCompositeGetNumSub, libceed), Cint, (CeedOperator, Ptr{CeedInt}), op, num_suboperators)
 end
 
-function CeedCompositeOperatorGetSubList(op, sub_operators)
-    ccall((:CeedCompositeOperatorGetSubList, libceed), Cint, (CeedOperator, Ptr{Ptr{CeedOperator}}), op, sub_operators)
+function CeedOperatorCompositeGetSubList(op, sub_operators)
+    ccall((:CeedOperatorCompositeGetSubList, libceed), Cint, (CeedOperator, Ptr{Ptr{CeedOperator}}), op, sub_operators)
 end
 
 function CeedOperatorCheckReady(op)
@@ -738,8 +738,8 @@ function CeedOperatorLinearAssemble(op, values)
     ccall((:CeedOperatorLinearAssemble, libceed), Cint, (CeedOperator, CeedVector), op, values)
 end
 
-function CeedCompositeOperatorGetMultiplicity(op, num_skip_indices, skip_indices, mult)
-    ccall((:CeedCompositeOperatorGetMultiplicity, libceed), Cint, (CeedOperator, CeedInt, Ptr{CeedInt}, CeedVector), op, num_skip_indices, skip_indices, mult)
+function CeedOperatorCompositeGetMultiplicity(op, num_skip_indices, skip_indices, mult)
+    ccall((:CeedOperatorCompositeGetMultiplicity, libceed), Cint, (CeedOperator, CeedInt, Ptr{CeedInt}, CeedVector), op, num_skip_indices, skip_indices, mult)
 end
 
 function CeedOperatorMultigridLevelCreate(op_fine, p_mult_fine, rstr_coarse, basis_coarse, op_coarse, op_prolong, op_restrict)
diff --git a/julia/LibCEED.jl/test/rundevtests.jl b/julia/LibCEED.jl/test/rundevtests.jl
index 9527d8d2dc..59d0e4840e 100644
--- a/julia/LibCEED.jl/test/rundevtests.jl
+++ b/julia/LibCEED.jl/test/rundevtests.jl
@@ -8,4 +8,36 @@ function checkoutput(str, fname)
     return true
 end
 
-@testset "LibCEED Development Tests" begin end
+@testset "LibCEED Development Tests" begin
+    @testset "Operator" begin
+        c = Ceed()
+        @interior_qf id = (
+            c,
+            (input, :in, EVAL_INTERP),
+            (output, :out, EVAL_INTERP),
+            begin
+                output[] = input
+            end,
+        )
+        b = create_tensor_h1_lagrange_basis(c, 3, 1, 3, 3, GAUSS_LOBATTO)
+        n = getnumnodes(b)
+        offsets = Vector{CeedInt}(0:n-1)
+        r = create_elem_restriction(c, 1, n, 1, 1, n, offsets)
+        op = Operator(
+            c;
+            qf=id,
+            fields=[
+                (:input, r, b, CeedVectorActive()),
+                (:output, r, b, CeedVectorActive()),
+            ],
+        )
+
+        v = rand(CeedScalar, n)
+        v1 = CeedVector(c, v)
+        v2 = CeedVector(c, n)
+
+        comp_op = create_composite_operator(c, [op])
+        apply!(comp_op, v1, v2)
+        @test @witharray_read(a1 = v1, @witharray_read(a2 = v2, a1 == a2))
+    end
+end
diff --git a/julia/LibCEED.jl/test/runtests.jl b/julia/LibCEED.jl/test/runtests.jl
index 83c7598ecd..724240d786 100644
--- a/julia/LibCEED.jl/test/runtests.jl
+++ b/julia/LibCEED.jl/test/runtests.jl
@@ -256,10 +256,6 @@ else
             LibCEED.assemble_add_diagonal!(op, diag_vector)
             @test @witharray(a = diag_vector, a == fill(1.0, n))
 
-            comp_op = create_composite_operator(c, [op])
-            apply!(comp_op, v1, v2)
-            @test @witharray_read(a1 = v1, @witharray_read(a2 = v2, a1 == a2))
-
             @test showstr(op) == """
                 CeedOperator
                   1 elements with 27 quadrature points each
diff --git a/python/ceed_operator.py b/python/ceed_operator.py
index cce2ee5ae5..e3053439f0 100644
--- a/python/ceed_operator.py
+++ b/python/ceed_operator.py
@@ -331,7 +331,7 @@ def __init__(self, ceed):
         # Reference to Ceed
         self._ceed = ceed
         # libCEED call
-        err_code = lib.CeedCompositeOperatorCreate(
+        err_code = lib.CeedOperatorCreateComposite(
             self._ceed._pointer[0], self._pointer)
         self._ceed._check_error(err_code)
 
@@ -343,7 +343,7 @@ def add_sub(self, subop):
              subop: sub-operator Operator"""
 
         # libCEED call
-        err_code = lib.CeedCompositeOperatorAddSub(
+        err_code = lib.CeedOperatorCompositeAddSub(
             self._pointer[0], subop._pointer[0])
         self._ceed._check_error(err_code)
 
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index bd480c3936..e82e92a393 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -2119,7 +2119,7 @@ impl<'a> CompositeOperator<'a> {
     // Constructor
     pub fn create(ceed: &crate::Ceed) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        ceed.check_error(unsafe { bind_ceed::CeedCompositeOperatorCreate(ceed.ptr, &mut ptr) })?;
+        ceed.check_error(unsafe { bind_ceed::CeedOperatorCreateComposite(ceed.ptr, &mut ptr) })?;
         Ok(Self {
             op_core: OperatorCore {
                 ptr,
@@ -2401,7 +2401,7 @@ impl<'a> CompositeOperator<'a> {
     #[allow(unused_mut)]
     pub fn sub_operator(mut self, subop: &Operator) -> crate::Result<Self> {
         self.op_core.check_error(unsafe {
-            bind_ceed::CeedCompositeOperatorAddSub(self.op_core.ptr, subop.op_core.ptr)
+            bind_ceed::CeedOperatorCompositeAddSub(self.op_core.ptr, subop.op_core.ptr)
         })?;
         Ok(self)
     }
diff --git a/tests/t520-operator-f.f90 b/tests/t520-operator-f.f90
index 628ce3735c..3882ddc1e0 100644
--- a/tests/t520-operator-f.f90
+++ b/tests/t520-operator-f.f90
@@ -211,13 +211,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t520-operator.c b/tests/t520-operator.c
index 31fceb9c97..5632b3e2d6 100644
--- a/tests/t520-operator.c
+++ b/tests/t520-operator.c
@@ -159,25 +159,25 @@ int main(int argc, char **argv) {
 
   // Set up Composite Operators
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_setup);
+  CeedOperatorCreateComposite(ceed, &op_setup);
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_mass);
+  CeedOperatorCreateComposite(ceed, &op_mass);
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
-  {  // Test CeedCompositeOperatorGetSubByName
+  {  // Test CeedOperatorCompositeGetSubByName
     CeedOperator op_byname;
 
-    CeedCompositeOperatorGetSubByName(op_mass, "mass hex", &op_byname);
-    if (op_byname != op_mass_hex) printf("CeedCompositeOperatorGetSubByName returned incorrect Sub Operator");
+    CeedOperatorCompositeGetSubByName(op_mass, "mass hex", &op_byname);
+    if (op_byname != op_mass_hex) printf("CeedOperatorCompositeGetSubByName returned incorrect Sub Operator");
 
-    CeedCompositeOperatorGetSubByName(op_mass, "asdf", &op_byname);
-    if (op_byname != NULL) printf("CeedCompositeOperatorGetSubByName returned non-NULL for non-existent Sub Operator");
+    CeedOperatorCompositeGetSubByName(op_mass, "asdf", &op_byname);
+    if (op_byname != NULL) printf("CeedOperatorCompositeGetSubByName returned non-NULL for non-existent Sub Operator");
   }
 
   // Apply Setup Operator
diff --git a/tests/t521-operator-f.f90 b/tests/t521-operator-f.f90
index 20ab09eb7b..73fff92d7a 100644
--- a/tests/t521-operator-f.f90
+++ b/tests/t521-operator-f.f90
@@ -213,13 +213,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t521-operator.c b/tests/t521-operator.c
index 1fff943186..dd13ea5589 100644
--- a/tests/t521-operator.c
+++ b/tests/t521-operator.c
@@ -156,13 +156,13 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
 
   // Composite Operators
-  CeedCompositeOperatorCreate(ceed, &op_setup);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCreateComposite(ceed, &op_setup);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
-  CeedCompositeOperatorCreate(ceed, &op_mass);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCreateComposite(ceed, &op_mass);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
diff --git a/tests/t522-operator-f.f90 b/tests/t522-operator-f.f90
index 4ea3773f7b..98b9089edb 100644
--- a/tests/t522-operator-f.f90
+++ b/tests/t522-operator-f.f90
@@ -215,13 +215,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_diff,err)
-      call ceedcompositeoperatoraddsub(op_diff,op_difftet,err)
-      call ceedcompositeoperatoraddsub(op_diff,op_diffhex,err)
+      call ceedoperatorcreatecomposite(ceed,op_diff,err)
+      call ceedoperatorcompositeaddsub(op_diff,op_difftet,err)
+      call ceedoperatorcompositeaddsub(op_diff,op_diffhex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t522-operator.c b/tests/t522-operator.c
index b2e1da90ac..8572c0d687 100644
--- a/tests/t522-operator.c
+++ b/tests/t522-operator.c
@@ -159,13 +159,13 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
 
   // Composite Operators
-  CeedCompositeOperatorCreate(ceed, &op_setup);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCreateComposite(ceed, &op_setup);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
-  CeedCompositeOperatorCreate(ceed, &op_diff);
-  CeedCompositeOperatorAddSub(op_diff, op_diff_tet);
-  CeedCompositeOperatorAddSub(op_diff, op_diff_hex);
+  CeedOperatorCreateComposite(ceed, &op_diff);
+  CeedOperatorCompositeAddSub(op_diff, op_diff_tet);
+  CeedOperatorCompositeAddSub(op_diff, op_diff_hex);
 
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
diff --git a/tests/t523-operator-f.f90 b/tests/t523-operator-f.f90
index 0431b60ee7..ea20128d3c 100644
--- a/tests/t523-operator-f.f90
+++ b/tests/t523-operator-f.f90
@@ -205,15 +205,15 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
       call ceedoperatorsetname(op_setup,'setup',err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
       call ceedoperatorsetname(op_mass,'mass',err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! View
       call ceedoperatorview(op_setup,err)
diff --git a/tests/t523-operator.c b/tests/t523-operator.c
index 9b614ec360..2e9dd5c264 100644
--- a/tests/t523-operator.c
+++ b/tests/t523-operator.c
@@ -150,18 +150,18 @@ int main(int argc, char **argv) {
 
   // Set up Composite Operators
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_setup);
+  CeedOperatorCreateComposite(ceed, &op_setup);
   CeedOperatorSetName(op_setup, "setup");
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_mass);
+  CeedOperatorCreateComposite(ceed, &op_mass);
   CeedOperatorSetName(op_mass, "mass");
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // View
   CeedOperatorViewTerse(op_setup, stdout);
diff --git a/tests/t524-operator-f.f90 b/tests/t524-operator-f.f90
index 4639442a5c..16b041c09a 100644
--- a/tests/t524-operator-f.f90
+++ b/tests/t524-operator-f.f90
@@ -215,13 +215,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t524-operator.c b/tests/t524-operator.c
index fec0fe6ccd..3d61a563b3 100644
--- a/tests/t524-operator.c
+++ b/tests/t524-operator.c
@@ -155,13 +155,13 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
 
   // Composite Operators
-  CeedCompositeOperatorCreate(ceed, &op_setup);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCreateComposite(ceed, &op_setup);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
-  CeedCompositeOperatorCreate(ceed, &op_mass);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCreateComposite(ceed, &op_mass);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
diff --git a/tests/t525-operator.c b/tests/t525-operator.c
index 9617d94f32..bed1365a77 100644
--- a/tests/t525-operator.c
+++ b/tests/t525-operator.c
@@ -73,9 +73,9 @@ int main(int argc, char **argv) {
   CeedOperatorCreate(ceed, qf_sub_2, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_sub_2);
 
   // Composite operator
-  CeedCompositeOperatorCreate(ceed, &op_composite);
-  CeedCompositeOperatorAddSub(op_composite, op_sub_1);
-  CeedCompositeOperatorAddSub(op_composite, op_sub_2);
+  CeedOperatorCreateComposite(ceed, &op_composite);
+  CeedOperatorCompositeAddSub(op_composite, op_sub_1);
+  CeedOperatorCompositeAddSub(op_composite, op_sub_2);
 
   // Check setting field in context of single sub-operator for composite operator
   CeedOperatorGetContextFieldLabel(op_composite, "time", &time_label);
diff --git a/tests/t526-operator.c b/tests/t526-operator.c
index 6d66590d15..8e68ab89b3 100644
--- a/tests/t526-operator.c
+++ b/tests/t526-operator.c
@@ -114,10 +114,10 @@ int main(int argc, char **argv) {
 
   // Set up Composite Operator
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_mass);
+  CeedOperatorCreateComposite(ceed, &op_mass);
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // Estimate FLOPs
   CeedQFunctionSetUserFlopsEstimate(qf_mass, 1);
diff --git a/tests/t538-operator.c b/tests/t538-operator.c
index 45e86ecdff..0e5267019c 100644
--- a/tests/t538-operator.c
+++ b/tests/t538-operator.c
@@ -104,9 +104,9 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Composite operator
-  CeedCompositeOperatorCreate(ceed, &op_apply);
-  CeedCompositeOperatorAddSub(op_apply, op_mass);
-  CeedCompositeOperatorAddSub(op_apply, op_diff);
+  CeedOperatorCreateComposite(ceed, &op_apply);
+  CeedOperatorCompositeAddSub(op_apply, op_mass);
+  CeedOperatorCompositeAddSub(op_apply, op_diff);
 
   // Assemble diagonal
   CeedVectorCreate(ceed, num_dofs, &assembled);
diff --git a/tests/t554-operator.c b/tests/t554-operator.c
index 0ca19605a9..d63c548696 100644
--- a/tests/t554-operator.c
+++ b/tests/t554-operator.c
@@ -33,10 +33,10 @@ int main(int argc, char **argv) {
   CeedVectorCreate(ceed, num_comp * num_dofs_u_fine, &v_fine);
 
   // Composite operators
-  CeedCompositeOperatorCreate(ceed, &op_mass_coarse);
-  CeedCompositeOperatorCreate(ceed, &op_mass_fine);
-  CeedCompositeOperatorCreate(ceed, &op_prolong);
-  CeedCompositeOperatorCreate(ceed, &op_restrict);
+  CeedOperatorCreateComposite(ceed, &op_mass_coarse);
+  CeedOperatorCreateComposite(ceed, &op_mass_fine);
+  CeedOperatorCreateComposite(ceed, &op_prolong);
+  CeedOperatorCreateComposite(ceed, &op_restrict);
 
   // Setup fine suboperators
   for (CeedInt i = 0; i < num_sub_ops; i++) {
@@ -99,7 +99,7 @@ int main(int argc, char **argv) {
     CeedOperatorApply(sub_op_setup, x, q_data, CEED_REQUEST_IMMEDIATE);
 
     // -- Composite operators
-    CeedCompositeOperatorAddSub(op_mass_fine, sub_op_mass_fine);
+    CeedOperatorCompositeAddSub(op_mass_fine, sub_op_mass_fine);
 
     // -- Cleanup
     CeedVectorDestroy(&q_data);
@@ -116,7 +116,7 @@ int main(int argc, char **argv) {
 
   // Scale for suboperator multiplicity
   CeedVectorCreate(ceed, num_comp * num_dofs_u_fine, &p_mult_fine);
-  CeedCompositeOperatorGetMultiplicity(op_mass_fine, 0, NULL, p_mult_fine);
+  CeedOperatorCompositeGetMultiplicity(op_mass_fine, 0, NULL, p_mult_fine);
 
   // Setup coarse and prolong/restriction suboperators
   for (CeedInt i = 0; i < num_sub_ops; i++) {
@@ -125,7 +125,7 @@ int main(int argc, char **argv) {
     CeedOperator       *sub_ops_mass_fine, sub_op_mass_coarse, sub_op_prolong, sub_op_restrict;
 
     // -- Fine grid operator
-    CeedCompositeOperatorGetSubList(op_mass_fine, &sub_ops_mass_fine);
+    CeedOperatorCompositeGetSubList(op_mass_fine, &sub_ops_mass_fine);
 
     // -- Restrictions
     CeedInt offset = num_elem_sub * i * (p_coarse - 1);
@@ -145,9 +145,9 @@ int main(int argc, char **argv) {
                                      &sub_op_prolong, &sub_op_restrict);
 
     // -- Composite operators
-    CeedCompositeOperatorAddSub(op_mass_coarse, sub_op_mass_coarse);
-    CeedCompositeOperatorAddSub(op_prolong, sub_op_prolong);
-    CeedCompositeOperatorAddSub(op_restrict, sub_op_restrict);
+    CeedOperatorCompositeAddSub(op_mass_coarse, sub_op_mass_coarse);
+    CeedOperatorCompositeAddSub(op_prolong, sub_op_prolong);
+    CeedOperatorCompositeAddSub(op_restrict, sub_op_restrict);
 
     // -- Cleanup
     CeedElemRestrictionDestroy(&elem_restriction_u_coarse);
diff --git a/tests/t565-operator.c b/tests/t565-operator.c
index b5a542451f..8ed3e0ea5f 100644
--- a/tests/t565-operator.c
+++ b/tests/t565-operator.c
@@ -107,9 +107,9 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Composite operator
-  CeedCompositeOperatorCreate(ceed, &op_apply);
-  CeedCompositeOperatorAddSub(op_apply, op_mass);
-  CeedCompositeOperatorAddSub(op_apply, op_diff);
+  CeedOperatorCreateComposite(ceed, &op_apply);
+  CeedOperatorCompositeAddSub(op_apply, op_mass);
+  CeedOperatorCompositeAddSub(op_apply, op_diff);
 
   // Fully assemble operator
   CeedSize   num_entries;

From 52b0e5638a25014f691e0ea82a9e87500116bc33 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 28 Jul 2025 09:00:40 -0600
Subject: [PATCH 479/571] interface - add deprecated warnings for old fn names

---
 Makefile                  |  1 +
 include/ceed/ceed.h       |  7 +------
 include/ceed/deprecated.h | 38 ++++++++++++++++++++++++++++++++++++++
 python/build_ceed_cffi.py |  1 +
 4 files changed, 41 insertions(+), 6 deletions(-)
 create mode 100644 include/ceed/deprecated.h

diff --git a/Makefile b/Makefile
index 5331ca3dc5..2fd059ce8f 100644
--- a/Makefile
+++ b/Makefile
@@ -876,6 +876,7 @@ install : $(libceed) $(OBJDIR)/ceed.pc
 	  "$(includedir)/ceed/jit-source/gallery/" "$(includedir)/ceed/jit-source/magma/"\
 	  "$(includedir)/ceed/jit-source/sycl/" "$(libdir)" "$(pkgconfigdir)")
 	$(INSTALL_DATA) include/ceed/ceed.h "$(DESTDIR)$(includedir)/ceed/"
+	$(INSTALL_DATA) include/ceed/deprecated.h "$(DESTDIR)$(includedir)/ceed/"
 	$(INSTALL_DATA) include/ceed/types.h "$(DESTDIR)$(includedir)/ceed/"
 	$(INSTALL_DATA) include/ceed/ceed-f32.h "$(DESTDIR)$(includedir)/ceed/"
 	$(INSTALL_DATA) include/ceed/ceed-f64.h "$(DESTDIR)$(includedir)/ceed/"
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 8d791ad94e..65e94ce43a 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -474,12 +474,7 @@ CEED_EXTERN int  CeedOperatorAssemblyDataStrip(CeedOperator op);
 CEED_EXTERN int  CeedOperatorDestroy(CeedOperator *op);
 
 // Compatibility with previous composite CeedOperator naming
-#define CeedCompositeOperatorCreate(a, b) CeedOperatorCreateComposite(a, b)
-#define CeedCompositeOperatorAddSub(a, b) CeedOperatorCompositeAddSub(a, b)
-#define CeedCompositeOperatorGetNumSub(a, b) CeedOperatorCompositeGetNumSub(a, b)
-#define CeedCompositeOperatorGetSubList(a, b) CeedOperatorCompositeGetSubList(a, b)
-#define CeedCompositeOperatorGetSubByName(a, b) CeedOperatorCompositeGetSubByName(a, b, c)
-#define CeedCompositeOperatorGetMultiplicity(a, b, c, d) CeedOperatorCompositeGetMultiplicity(a, b, c, d)
+#include "deprecated.h"
 
 CEED_EXTERN int CeedOperatorGetFieldByName(CeedOperator op, const char *field_name, CeedOperatorField *op_field);
 CEED_EXTERN int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name);
diff --git a/include/ceed/deprecated.h b/include/ceed/deprecated.h
new file mode 100644
index 0000000000..4f54f5c606
--- /dev/null
+++ b/include/ceed/deprecated.h
@@ -0,0 +1,38 @@
+/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+///
+/// SPDX-License-Identifier: BSD-2-Clause
+///
+/// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Public header for user and utility components of libCEED
+#pragma once
+
+#if __STDC_VERSION__ >= 202311L
+#define DEPRECATED(msg) [[deprecated(msg)]]
+#elif defined(__GNUC__) || defined(__clang__)
+#define DEPRECATED(msg) __attribute__((deprecated(msg)))
+#else
+#define DEPRECATED(msg)
+#endif
+
+// Compatibility with previous composite CeedOperator naming
+DEPRECATED("Use CeedOperatorCreateComposite()")
+static inline int CeedCompositeOperatorCreate(Ceed a, CeedOperator *b) { return CeedOperatorCreateComposite(a, b); }
+DEPRECATED("Use CeedOperatorCompositeAddSub()")
+static inline int CeedCompositeOperatorAddSub(CeedOperator a, CeedOperator b) { return CeedOperatorCompositeAddSub(a, b); }
+DEPRECATED("Use CeedOperatorCompositeGetNumSub()")
+static inline int CeedCompositeOperatorGetNumSub(CeedOperator a, CeedInt *b) { return CeedOperatorCompositeGetNumSub(a, b); }
+DEPRECATED("Use CeedOperatorCompositeGetSubList()")
+static inline int CeedCompositeOperatorGetSubList(CeedOperator a, CeedOperator **b) { return CeedOperatorCompositeGetSubList(a, b); }
+DEPRECATED("Use CeedCompositeOperatorGetSubByName()")
+static inline int CeedCompositeOperatorGetSubByName(CeedOperator a, const char *b, CeedOperator *c) {
+  return CeedOperatorCompositeGetSubByName(a, b, c);
+}
+DEPRECATED("Use CeedOperatorCompositeGetMultiplicity()")
+static inline int CeedCompositeOperatorGetMultiplicity(CeedOperator a, CeedInt b, CeedInt *c, CeedVector d) {
+  return CeedOperatorCompositeGetMultiplicity(a, b, c, d);
+}
+
+#undef DEPRECATED
diff --git a/python/build_ceed_cffi.py b/python/build_ceed_cffi.py
index cf302c85a1..a802357303 100644
--- a/python/build_ceed_cffi.py
+++ b/python/build_ceed_cffi.py
@@ -34,6 +34,7 @@ def get_ceed_dirs():
         lines += [line.strip() for line in f if
                   not (line.startswith("#") and not line.startswith("#include")) and
                   not line.startswith("  static") and
+                  not line.startswith("#include \"deprecated.h\"") and
                   not line.startswith("  CEED_QFUNCTION_ATTR") and
                   "CeedErrorImpl" not in line and
                   "const char *, ...);" not in line and

From 2027fb9d13fe34211738d8539f90542a9801ae2c Mon Sep 17 00:00:00 2001
From: SirAlienTheGreat <79958059+SirAlienTheGreat@users.noreply.github.com>
Date: Thu, 31 Jul 2025 12:43:27 -0600
Subject: [PATCH 480/571] Rust and cuda clang support (#1873)

* Switch compiler to clang (not portable)

* test add_num

* compile with llvm tools

* not working linking

* not fixed

* Update ex1-volume.h

* update

* remove global path

* changes

* changes 2

* crate works

* basic gpu rust compilation

* still not working

* rust source roots basic support

* nvrtc/clang selection

* cleanup

* update example (not working)

* add rust example

* fix merge issue

* delete temp files

* cleanup

* rust qfunc 2d array (needs doc)

* cleanup

* more cleanup

* downgrade back to c++11

* format

* final draft cleanup

* formatting + CUDA_CLANG -> GPU_CLANG

* Update cuda CEED_QFUNCTION_RUST

* fix python

* fix python and format

* format fr

* update comment

* fix python fr

* Apply error suggestions from code review

* update errors to libceed format

* Apply suggestions from code review

* add optimization flag

* remove line breaks

* Apply suggestions from code review

* avoid python in macro better

* add rust example

* format

* Apply suggestions from code review

* move rust example to own directory

* Simplify python exclusion logic

* re-fix python

* Update python/build_ceed_cffi.py

* change names and simplify makefile

* Revert "change names and simplify makefile"

This reverts commit 96e762f9f5d50d8ed9d3673e7fda850dfc390abe.

* Apply Jeremy's diff

* Simplify CeedCallSystem

* use rust-install llvm tools

* add gitignores

* update paths

* example absolute path

* fix comments

* apply partial diff

* add newline

* add makefile

* Update examples/rust-qfunctions/Makefile

* update makefile

---------

Co-authored-by: Allen MacFarland <alma4974@noether>
---
 .gitignore                                    |   3 +
 .gitlab-ci.yml                                |  51 ++
 Cargo.toml                                    |  19 +-
 Makefile                                      |  23 +-
 backends/cuda/ceed-cuda-compile.cpp           | 249 ++++++++--
 examples/Makefile                             |   2 +
 examples/rust-qfunctions/.gitignore           |   1 +
 examples/rust-qfunctions/Makefile             |  35 ++
 .../ex1-volume-rs/.cargo/config.toml          |   6 +
 .../rust-qfunctions/ex1-volume-rs/.gitignore  |   3 +
 .../rust-qfunctions/ex1-volume-rs/Cargo.toml  |  17 +
 .../ex1-volume-rs/rust-toolchain.toml         |   2 +
 .../rust-qfunctions/ex1-volume-rs/src/lib.rs  | 124 +++++
 examples/rust-qfunctions/ex1-volume.c         | 439 ++++++++++++++++++
 examples/rust-qfunctions/ex1-volume.h         |  19 +
 include/ceed-impl.h                           |   3 +
 include/ceed/backend.h                        |   2 +
 include/ceed/ceed.h                           |  13 +-
 include/ceed/jit-source/cuda/cuda-jit.h       |   4 +
 include/ceed/types.h                          |  27 ++
 interface/ceed-config.c                       |  32 ++
 interface/ceed.c                              |  88 ++++
 python/build_ceed_cffi.py                     |  33 +-
 tests/junit.py                                |   2 +
 24 files changed, 1123 insertions(+), 74 deletions(-)
 create mode 100644 examples/rust-qfunctions/.gitignore
 create mode 100644 examples/rust-qfunctions/Makefile
 create mode 100644 examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml
 create mode 100644 examples/rust-qfunctions/ex1-volume-rs/.gitignore
 create mode 100644 examples/rust-qfunctions/ex1-volume-rs/Cargo.toml
 create mode 100644 examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml
 create mode 100644 examples/rust-qfunctions/ex1-volume-rs/src/lib.rs
 create mode 100644 examples/rust-qfunctions/ex1-volume.c
 create mode 100644 examples/rust-qfunctions/ex1-volume.h

diff --git a/.gitignore b/.gitignore
index 06ab679a45..8a036aadc5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,6 +51,9 @@ doc/sphinx/build/
 # Example docs automatically copied from source tree
 doc/sphinx/source/examples/
 
+# Clang GPU temp files
+temp_*
+
 # Output files, videos, and compressed archives should not be added accidentally
 *.avi
 *.bin
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5389460d56..9a7b3adfc5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -171,6 +171,57 @@ noether-cpu:
 #    - touch .SUCCESS
 
 
+# ----------------------------------------------------------------------------------------
+# Rust + CUDA
+# ----------------------------------------------------------------------------------------
+noether-rust-qfunctions:
+  stage: test:gpu-and-float
+  tags:
+    - cuda
+  interruptible: true
+  before_script:
+    # Environment
+    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc GPU_CLANG=1
+    - export NPROC_POOL=4
+    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
+    - echo "-------------- CC ------------------" && $CC --version
+    - echo "-------------- CXX -----------------" && $CXX --version
+    - echo "-------------- FC ------------------" && $FC --version
+    - echo "-------------- NVCC ----------------" && $NVCC --version
+    - echo "-------------- Rustc ---------------" && rustc --version
+    - echo "-------------- Clang++ -------------" && clang++ --version
+    - echo "-------------- GCOV ----------------" && gcov --version
+  script:
+    - rm -f .SUCCESS
+    # libCEED
+    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
+    - echo "-------------- libCEED -------------" && make info
+    - make clean
+    - make -k -j$NPROC_CPU -l$NPROC_CPU
+    # -- libCEED only tests
+    - echo "-------------- Rust QFunction tests -"
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    - export PETSC_DIR= PETSC_ARCH=
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="rust-qfunction" junit realsearch=rust-qfunction
+    # Report status
+    - touch .SUCCESS
+  after_script:
+    - |
+      if [ -f .SUCCESS ]; then
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples;
+      fi
+  artifacts:
+    paths:
+      - build/*.junit
+    reports:
+      junit: build/*.junit
+
+
 # ----------------------------------------------------------------------------------------
 # CUDA backends
 # ----------------------------------------------------------------------------------------
diff --git a/Cargo.toml b/Cargo.toml
index 218a10b19e..83aaac7b46 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,12 +1,13 @@
 [workspace]
 members = [
-        "rust/libceed",
-        "rust/libceed-sys",
-        "examples/rust/ex1-volume",
-        "examples/rust/ex1-volume-vector",
-        "examples/rust/ex2-surface",
-        "examples/rust/ex2-surface-vector",
-        "examples/rust/ex3-volume",
-        "examples/rust/ex3-volume-vector",
-        "examples/rust/mesh",
+    "rust/libceed",
+    "rust/libceed-sys",
+    "examples/rust/ex1-volume",
+    "examples/rust/ex1-volume-vector",
+    "examples/rust/ex2-surface",
+    "examples/rust/ex2-surface-vector",
+    "examples/rust/ex3-volume",
+    "examples/rust/ex3-volume-vector",
+    "examples/rust/mesh",
 ]
+exclude = ["examples/rust-qfunctions/ex1-volume-rs"]
diff --git a/Makefile b/Makefile
index 2fd059ce8f..8a097ed5c6 100644
--- a/Makefile
+++ b/Makefile
@@ -349,6 +349,10 @@ mfemexamples     := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%)
 # Nek5K Examples
 nekexamples := $(OBJDIR)/nek-bps
 
+# Rust QFunction Examples
+rustqfunctions.c       := $(sort $(wildcard examples/rust-qfunctions/*.c))
+rustqfunctionsexamples := $(rustqfunctions.c:examples/rust-qfunctions/%.c=$(OBJDIR)/rustqfunctions-%)
+
 # PETSc Examples
 petscexamples.c := $(wildcard examples/petsc/*.c)
 petscexamples   := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%)
@@ -733,6 +737,11 @@ $(OBJDIR)/nek-bps : examples/nek/bps/bps.usr examples/nek/nek-examples.sh $(libc
 	mv examples/nek/build/bps $(OBJDIR)/bps
 	cp examples/nek/nek-examples.sh $(OBJDIR)/nek-bps
 
+# Rust QFunctions
+$(OBJDIR)/rustqfunctions-% : examples/rust-qfunctions/%.c $(libceed) | $$(@D)/.DIR
+	+$(MAKE) -C examples/rust-qfunctions CEED_DIR=`pwd`
+	cp examples/rust-qfunctions/$* $@
+
 # PETSc
 # Several executables have common utilities, but we can't build the utilities
 # from separate submake invocations because they'll compete with each
@@ -763,19 +772,22 @@ $(OBJDIR)/solids-% : examples/solids/%.c examples/solids/%.h \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/solids/$* $@
 
-examples : $(allexamples)
-ceedexamples : $(examples)
-nekexamples : $(nekexamples)
-mfemexamples : $(mfemexamples)
+examples      : $(allexamples)
+ceedexamples  : $(examples)
+nekexamples   : $(nekexamples)
+mfemexamples  : $(mfemexamples)
 petscexamples : $(petscexamples)
 
+rustqfunctionsexamples : $(rustqfunctionsexamples)
+
 external_examples := \
 	$(if $(MFEM_DIR),$(mfemexamples)) \
 	$(if $(PETSC_DIR),$(petscexamples)) \
 	$(if $(NEK5K_DIR),$(nekexamples)) \
 	$(if $(DEAL_II_DIR),$(dealiiexamples)) \
 	$(if $(PETSC_DIR),$(fluidsexamples)) \
-	$(if $(PETSC_DIR),$(solidsexamples))
+	$(if $(PETSC_DIR),$(solidsexamples)) \
+	$(rustqfunctionsexamples)
 
 allexamples = $(examples) $(external_examples)
 
@@ -904,6 +916,7 @@ cln clean :
 	$(call quiet,MAKE) -C examples clean NEK5K_DIR="$(abspath $(NEK5K_DIR))"
 	$(call quiet,MAKE) -C python/tests clean
 	$(RM) benchmarks/*output.txt
+	$(RM) -f temp_*
 
 distclean : clean
 	$(RM) -r doc/html doc/sphinx/build $(CONFIG)
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index dc430b81fa..c73a988a55 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -11,11 +11,17 @@
 #include <ceed/backend.h>
 #include <ceed/jit-tools.h>
 #include <cuda_runtime.h>
+#include <dirent.h>
 #include <nvrtc.h>
 #include <stdarg.h>
 #include <string.h>
+#include <sys/types.h>
 
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
 #include <sstream>
+#include <string>
 
 #include "ceed-cuda-common.h"
 
@@ -31,9 +37,34 @@
     CeedChk_Nvrtc(ceed, ierr_q_); \
   } while (0)
 
+#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
+
+//------------------------------------------------------------------------------
+// Call system command and capture stdout + stderr
+//------------------------------------------------------------------------------
+static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
+  CeedDebug(ceed, "Running command:\n$ %s\n", command);
+  FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
+
+  CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s with command: %s", message, command);
+
+  char output[4 * CEED_MAX_RESOURCE_LEN];
+
+  while (fgets(output, sizeof(output), output_stream) != nullptr) {
+  }
+  CeedDebug(ceed, "Command output:\n%s\n", output);
+
+  CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s with error: %s", message, output);
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Compile CUDA kernel
 //------------------------------------------------------------------------------
+using std::ifstream;
+using std::ofstream;
+using std::ostringstream;
+
 static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
                                 const CeedInt num_defines, va_list args) {
   size_t                ptx_size;
@@ -48,6 +79,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   cudaFree(0);  // Make sure a Context exists for nvrtc
 
   std::ostringstream code;
+  bool               using_clang;
+
+  CeedCallBackend(CeedGetIsClang(ceed, &using_clang));
+
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
+               using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
+                           : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
+                             "variable GPU_CLANG=1");
 
   // Get kernel specific options, such as kernel constants
   if (num_defines > 0) {
@@ -116,66 +155,184 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   code << source;
 
   // Create Program
-  CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
-  if (CeedDebugFlag(ceed)) {
-    // LCOV_EXCL_START
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
-    for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
-      CeedDebug(ceed, "Option %d: %s", i, opts[i]);
-    }
-    CeedDebug(ceed, "");
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
-    // LCOV_EXCL_STOP
-  }
-  nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
-  for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
-    CeedCallBackend(CeedFree(&opts[num_opts + i]));
-  }
-  for (CeedInt i = 0; i < num_jit_defines; i++) {
-    CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
-  }
-  CeedCallBackend(CeedFree(&opts));
-  *is_compile_good = result == NVRTC_SUCCESS;
-  if (!*is_compile_good) {
-    char  *log;
-    size_t log_size;
-
-    CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
-    CeedCallBackend(CeedMalloc(log_size, &log));
-    CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
-    if (throw_error) {
-      return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
-    } else {
+  if (!using_clang) {
+    CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
+
+    if (CeedDebugFlag(ceed)) {
       // LCOV_EXCL_START
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
-      CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
-      CeedCallBackend(CeedFree(&log));
-      CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
-      return CEED_ERROR_SUCCESS;
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
+      for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
+        CeedDebug(ceed, "Option %d: %s", i, opts[i]);
+      }
+      CeedDebug(ceed, "");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
       // LCOV_EXCL_STOP
     }
-  }
+
+    nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
+
+    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+      CeedCallBackend(CeedFree(&opts[num_opts + i]));
+    }
+    for (CeedInt i = 0; i < num_jit_defines; i++) {
+      CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
+    }
+    CeedCallBackend(CeedFree(&opts));
+    *is_compile_good = result == NVRTC_SUCCESS;
+    if (!*is_compile_good) {
+      char  *log;
+      size_t log_size;
+
+      CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
+      CeedCallBackend(CeedMalloc(log_size, &log));
+      CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
+      if (throw_error) {
+        return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
+      } else {
+        // LCOV_EXCL_START
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+        CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+        CeedCallBackend(CeedFree(&log));
+        CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
+        return CEED_ERROR_SUCCESS;
+        // LCOV_EXCL_STOP
+      }
+    }
 
 #if CUDA_VERSION >= 11010
-  CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
-  CeedCallBackend(CeedMalloc(ptx_size, &ptx));
-  CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
+    CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
+    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
+    CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
 #else
-  CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
-  CeedCallBackend(CeedMalloc(ptx_size, &ptx));
-  CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
+    CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
+    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
+    CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
 #endif
-  CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
+    CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
+
+    CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
+    CeedCallBackend(CeedFree(&ptx));
+    return CEED_ERROR_SUCCESS;
+  } else {
+    const char *full_filename = "temp_kernel_source.cu";
+    FILE       *file          = fopen(full_filename, "w");
+
+    CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang\n");
+    fputs(code.str().c_str(), file);
+    fclose(file);
+
+    // Get rust crate directories
 
-  CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
-  CeedCallBackend(CeedFree(&ptx));
+    const char **rust_source_dirs     = nullptr;
+    int          num_rust_source_dirs = 0;
+
+    CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs));
+
+    std::string rust_dirs[10];
+
+    if (num_rust_source_dirs > 0) {
+      CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]);
+    }
+
+    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
+      rust_dirs[i] = std::string(rust_source_dirs[i]);
+    }
+
+    CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));
+
+    char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");
+
+    if (rust_toolchain == nullptr) {
+      rust_toolchain = (char *)"nightly";
+      setenv("RUST_TOOLCHAIN", "nightly", 0);
+    }
+
+    // Compile Rust crate(s) needed
+    std::string command;
+
+    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
+      command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
+                "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
+      CeedCallSystem(ceed, command.c_str(), "build Rust crate");
+    }
+
+    // Compile wrapper kernel
+    command = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
+              " --cuda-device-only -emit-llvm -S temp_kernel_source.cu -o temp_kernel.ll ";
+    command += opts[4];
+    CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
+
+    // the find command finds the rust-installed llvm-link tool and runs it
+    command = "$(find $(rustup run " + std::string(rust_toolchain) +
+              " rustc --print sysroot) -name llvm-link) temp_kernel.ll --ignore-non-bitcode --internalize --only-needed -S -o "
+              "temp_kernel_linked.ll  ";
+
+    // Searches for .a files in rust directoy
+    // Note: this is necessary because rust crate names may not match the folder they are in
+    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
+      std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
+      DIR        *dp  = opendir(dir.c_str());
+
+      CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
+      struct dirent *entry;
+
+      // finds files ending in .a
+      while ((entry = readdir(dp)) != nullptr) {
+        std::string filename(entry->d_name);
+
+        if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
+          command += dir + "/" + filename + " ";
+        }
+      }
+      closedir(dp);
+      // TODO: when libCEED switches to c++17, switch to std::filesystem for the loop above
+    }
+
+    // Link, optimize, and compile final CUDA kernel
+    // note that the find command is used to find the rust-installed llvm tool
+    CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
+    CeedCallSystem(ceed,
+                   ("$(find $(rustup run " + std::string(rust_toolchain) +
+                    " rustc --print sysroot) -name opt) --passes internalize,inline temp_kernel_linked.ll -o temp_kernel_opt.bc")
+                       .c_str(),
+                   "optimize linked C and Rust source");
+    CeedCallSystem(ceed,
+                   ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
+                    std::to_string(prop.major) + std::to_string(prop.minor) + " temp_kernel_opt.bc -o temp_kernel_final.ptx")
+                       .c_str(),
+                   "compile final CUDA kernel");
+
+    ifstream      ptxfile("temp_kernel_final.ptx");
+    ostringstream sstr;
+
+    sstr << ptxfile.rdbuf();
+
+    auto ptx_data = sstr.str();
+    ptx_size      = ptx_data.length();
+
+    int result = cuModuleLoadData(module, ptx_data.c_str());
+
+    *is_compile_good = result == 0;
+    if (!*is_compile_good) {
+      if (throw_error) {
+        return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
+      } else {
+        // LCOV_EXCL_START
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+        CeedDebug(ceed, "Error: Failed to load module data");
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+        return CEED_ERROR_SUCCESS;
+        // LCOV_EXCL_STOP
+      }
+    }
+  }
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/examples/Makefile b/examples/Makefile
index d32f406f5a..f220748ba2 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -53,10 +53,12 @@ fluids:
 
 solids:
 	make CEED_DIR=$(CEED_DIR) PETSC_DIR=$(PETSC_DIR) PETSC_ARCH=$(PETSC_ARCH) -C solids all
+
 clean:
 	+make -C ceed clean
 	+make -C mfem clean
 	+make -C nek clean
+	+make -C rust-qfunctions clean
 	+make -C petsc clean
 	+make -C fluids clean
 	+make -C solids clean
diff --git a/examples/rust-qfunctions/.gitignore b/examples/rust-qfunctions/.gitignore
new file mode 100644
index 0000000000..7c891764ce
--- /dev/null
+++ b/examples/rust-qfunctions/.gitignore
@@ -0,0 +1 @@
+temp_*
diff --git a/examples/rust-qfunctions/Makefile b/examples/rust-qfunctions/Makefile
new file mode 100644
index 0000000000..324297c5d9
--- /dev/null
+++ b/examples/rust-qfunctions/Makefile
@@ -0,0 +1,35 @@
+# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+OPT ?= -O -g
+
+# Ceed directory
+CEED_DIR ?= ../..
+CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c11  $(OPT)
+CEED_LIBS ?= -Wl,-rpath,$(abspath $(CEED_DIR)/lib) -L$(CEED_DIR)/lib -lceed -L$(CEED_DIR)/examples/ceed -lm
+
+EXAMPLES.c = $(wildcard ex*.c)
+EXAMPLES = $(EXAMPLES.c:%.c=%)
+
+.SUFFIXES:
+.SUFFIXES: .c
+.PHONY: all clean
+
+all: $(EXAMPLES)
+
+# Remove built-in rules
+%: %.c
+
+# Special build rule for example 1 (rust)
+ex1-volume: ex1-volume.c
+	cargo +nightly build --release --manifest-path ex1-volume-rs/Cargo.toml --config ex1-volume-rs/.cargo/config.toml
+	$(LINK.c) $(CEED_FLAGS) $(CEED_LDFLAGS) $(abspath $<) -o $@ $(CEED_LIBS) -L$(CEED_DIR)/examples/rust-qfunctions/ex1-volume-rs/target/release -lex1_volume_rs
+
+clean:
+	rm -f *~ $(EXAMPLES)
+	rm -f temp_*
+	rm -rf *.dSYM *.TVD.*breakpoints
diff --git a/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml b/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml
new file mode 100644
index 0000000000..ca727ba27d
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.nvptx64-nvidia-cuda]
+rustflags = [
+  "-C", "linker-plugin-lto",
+]
+[unstable]
+build-std = ["panic_abort","core", "alloc"]
diff --git a/examples/rust-qfunctions/ex1-volume-rs/.gitignore b/examples/rust-qfunctions/ex1-volume-rs/.gitignore
new file mode 100644
index 0000000000..20a838f835
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/.gitignore
@@ -0,0 +1,3 @@
+target
+registry
+Cargo.lock
diff --git a/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml b/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml
new file mode 100644
index 0000000000..afc2f3b200
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "ex1-volume-rs"
+version = "0.1.0"
+edition = "2021"
+
+[profile.dev]
+panic = "abort"
+
+[profile.release]
+panic = "abort"
+
+# Compiles the crate as a lib (for GPU) and staticlib (for CPU)
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+ndarray = {version = "0.16.1", default-features = false}
diff --git a/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml b/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml
new file mode 100644
index 0000000000..5d56faf9ae
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "nightly"
diff --git a/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs b/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs
new file mode 100644
index 0000000000..8f2a36dfc9
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs
@@ -0,0 +1,124 @@
+#![no_std]
+#![allow(internal_features)]
+#![feature(asm_experimental_arch, abi_ptx, core_intrinsics)]
+use core::ffi::c_void;
+use core::intrinsics::abort;
+use core::panic::PanicInfo;
+
+use ndarray::ArrayView;
+
+// This is a dummy allocator that always returns null. Heap allocations do not work on GPUs
+use core::alloc::{GlobalAlloc, Layout};
+pub struct Allocator;
+unsafe impl GlobalAlloc for Allocator {
+    unsafe fn alloc(&self, _layout: Layout) -> *mut u8 {
+        0 as *mut u8
+    }
+    unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) {
+        abort(); // since we never allocate
+    }
+}
+#[global_allocator]
+static GLOBAL_ALLOCATOR: Allocator = Allocator;
+
+// This is a copy of the same data structure defined in the .h file. It can be autogenerated using bindgen/cbindgen
+#[doc = " A structure used to pass additional data to f_build_mass"]
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct BuildContext {
+    pub dim: i32,
+    pub space_dim: i32,
+}
+
+// On no_std targets, its required to implement your own panic function.
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! {
+    abort()
+}
+
+/* The no_mangle is required because rust "mangles" names (changes them to prevent namespace conflicts)
+Also note that this function ends in _rs, even though the C call `CEED_QFUNCTION_RUST(build_mass)` doesn't */
+#[no_mangle]
+pub unsafe extern "C" fn build_mass_rs(
+    ctx: *mut c_void,
+    q: i32,
+    in_: *const *const f64,
+    out: *mut *mut f64,
+) -> i8 {
+    let ctx: *mut BuildContext = unsafe { core::mem::transmute(ctx) };
+    let ctx: &mut BuildContext = &mut *ctx;
+
+    let in_slice = core::slice::from_raw_parts(in_, 2);
+
+    // in_slice[0] is Jacobians with shape [dim, dim, Q]
+    // in_slice[1] is quadrature weights with shape [1, Q]
+    let j_ptr = in_slice[0];
+    let w_ptr = in_slice[1];
+
+    let j = ArrayView::from_shape_ptr((ctx.dim as usize, ctx.dim as usize, q as usize), j_ptr);
+
+    let w = core::slice::from_raw_parts(w_ptr, q as usize);
+
+    let out_slice = core::slice::from_raw_parts_mut(out, 1);
+    let q_data = core::slice::from_raw_parts_mut(out_slice[0], q as usize);
+
+    match ctx.dim * 10 + ctx.space_dim {
+        11 => {
+            // Quadrature Point Loop
+            for i in 0..q as usize {
+                q_data[i] = j[[0, 0, i]] * w[i];
+            }
+        }
+        22 => {
+            // Quadrature Point Loop
+            for i in 0..q as usize {
+                q_data[i] = (j[[0, 0, i]] * j[[1, 1, i]] - j[[0, 1, i]] * j[[1, 0, i]]) * w[i];
+            }
+        }
+        33 => {
+            // Quadrature Point Loop
+            for i in 0..q as usize {
+                q_data[i] = (j[[0, 0, i]]
+                    * (j[[1, 1, i]] * j[[2, 2, i]] - j[[1, 2, i]] * j[[2, 1, i]])
+                    - j[[0, 1, i]] * (j[[1, 0, i]] * j[[2, 2, i]] - j[[1, 2, i]] * j[[2, 0, i]])
+                    + j[[0, 2, i]] * (j[[1, 0, i]] * j[[2, 1, i]] - j[[1, 1, i]] * j[[2, 0, i]]))
+                    * w[i];
+            }
+        }
+        _ => {
+            abort();
+        }
+    }
+
+    0
+}
+
+/* The no_mangle is required because rust "mangles" names (changes them to prevent namespace conflicts)
+Also note that this function ends in _rs, even though the C call `CEED_QFUNCTION_RUST(apply_mass)` doesn't
+For FFI reasons, it is also required to include all parameters in this exact form, even if you don't use all of them*/
+#[no_mangle]
+pub unsafe extern "C" fn apply_mass_rs(
+    _ctx: *mut c_void,
+    q: i32,
+    in_: *const *const f64,
+    out: *mut *mut f64,
+) -> i8 {
+    let in_slice = core::slice::from_raw_parts(in_, 2);
+
+    let u_ptr = in_slice[0];
+    let q_data_ptr = in_slice[1];
+
+    let u = core::slice::from_raw_parts(u_ptr, q as usize);
+    let q_data = core::slice::from_raw_parts(q_data_ptr, q as usize);
+
+    let out_slice = core::slice::from_raw_parts_mut(out, 1);
+
+    let v_ptr = out_slice[0];
+    let v = core::slice::from_raw_parts_mut(v_ptr, q as usize);
+
+    for i in 0..q as usize {
+        v[i] = q_data[i] * u[i];
+    }
+
+    0
+}
diff --git a/examples/rust-qfunctions/ex1-volume.c b/examples/rust-qfunctions/ex1-volume.c
new file mode 100644
index 0000000000..75b0a836f1
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume.c
@@ -0,0 +1,439 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator.
+// Arbitrary mesh and solution degrees in 1D, 2D and 3D are supported from the same code.
+//
+// The example has no dependencies, and is designed to be self-contained.
+// For additional examples that use external discretization libraries (MFEM, PETSc, etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed).
+//
+// Build with:
+//
+//     make ex1-volume-rust [CEED_DIR=</path/to/libceed>]
+//
+// Sample runs:
+//
+//     ./ex1-volume
+//     ./ex1-volume -ceed /cpu/self
+//     ./ex1-volume -ceed /gpu/cuda
+//
+// Test in 1D-3D
+//TESTARGS(name="1D User QFunction") -ceed {ceed_resource} -d 1 -t
+//TESTARGS(name="2D User QFunction") -ceed {ceed_resource} -d 2 -t
+//TESTARGS(name="3D User QFunction") -ceed {ceed_resource} -d 3 -t
+//TESTARGS(name="1D Gallery QFunction") -ceed {ceed_resource} -d 1 -t -g
+//TESTARGS(name="2D Gallery QFunction") -ceed {ceed_resource} -d 2 -t -g
+//TESTARGS(name="3D Gallery QFunction") -ceed {ceed_resource} -d 3 -t -g
+
+/// @file
+/// libCEED example using mass operator to compute volume
+
+#include "ex1-volume.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Auxiliary functions
+int        GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]);
+int        BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                                     CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction);
+int        SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords);
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords);
+
+// Main example
+int main(int argc, const char *argv[]) {
+  const char *ceed_spec   = "/cpu/self";
+  CeedInt     dim         = 3;               // dimension of the mesh
+  CeedInt     num_comp_x  = 3;               // number of x components
+  CeedInt     mesh_degree = 4;               // polynomial degree for the mesh
+  CeedInt     sol_degree  = 4;               // polynomial degree for the solution
+  CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
+  CeedInt     prob_size   = -1;              // approximate problem size
+  CeedInt     help = 0, test = 0, gallery = 0, benchmark = 0;
+
+  // Process command line arguments.
+  for (int ia = 1; ia < argc; ia++) {
+    // LCOV_EXCL_START
+    int next_arg = ((ia + 1) < argc), parse_error = 0;
+    if (!strcmp(argv[ia], "-h")) {
+      help = 1;
+    } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) {
+      parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1;
+    } else if (!strcmp(argv[ia], "-d")) {
+      parse_error = next_arg ? dim = atoi(argv[++ia]), 0 : 1;
+      num_comp_x                   = dim;
+    } else if (!strcmp(argv[ia], "-m")) {
+      parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-p")) {
+      parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-q")) {
+      parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-s")) {
+      parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-t")) {
+      test = 1;
+    } else if (!strcmp(argv[ia], "-g")) {
+      gallery = 1;
+    }
+    if (parse_error) {
+      printf("Error parsing command line options.\n");
+      return 1;
+    }
+    // LCOV_EXCL_STOP
+  }
+  if (prob_size < 0) prob_size = test ? 8 * 16 : 256 * 1024;
+
+  // Print the values of all options:
+  if (!test || help) {
+    // LCOV_EXCL_START
+    printf("Selected options: [command line option] : <current value>\n");
+    printf("  Ceed specification     [-c] : %s\n", ceed_spec);
+    printf("  Mesh dimension         [-d] : %" CeedInt_FMT "\n", dim);
+    printf("  Mesh degree            [-m] : %" CeedInt_FMT "\n", mesh_degree);
+    printf("  Solution degree        [-p] : %" CeedInt_FMT "\n", sol_degree);
+    printf("  Num. 1D quadrature pts [-q] : %" CeedInt_FMT "\n", num_qpts);
+    printf("  Approx. # unknowns     [-s] : %" CeedInt_FMT "\n", prob_size);
+    printf("  QFunction source       [-g] : %s\n", gallery ? "gallery" : "header");
+    if (help) {
+      printf("Test/quiet mode is %s\n", (test ? "ON" : "OFF (use -t to enable)"));
+      return 0;
+    }
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Select appropriate backend and logical device based on the (-ceed) command line argument.
+  Ceed ceed;
+
+  CeedInit(ceed_spec, &ceed);
+
+  // Add the path to the Rust crate to the ceed object.
+  {
+    char  root[2048] = __FILE__;
+    char *last_slash = strrchr(root, '/');
+
+    strncpy(last_slash + 1, "ex1-volume-rs", 14);
+    CeedAddRustSourceRoot(ceed, root);
+  }
+
+  // Construct the mesh and solution bases.
+  CeedBasis mesh_basis, sol_basis;
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
+
+  // Determine the mesh size based on the given approximate problem size.
+  CeedInt num_xyz[dim];
+
+  GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]);
+    if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]);
+    if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]);
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
+  CeedInt             mesh_size, sol_size;
+  CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
+  BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
+  BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, &q_data_restriction);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Number of mesh nodes     : %" CeedInt_FMT "\n", mesh_size / dim);
+    printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size);
+    // LCOV_EXCL_STOP
+  }
+
+  // Create a CeedVector with the mesh coordinates.
+  CeedVector mesh_coords;
+
+  CeedVectorCreate(ceed, mesh_size, &mesh_coords);
+  SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
+
+  // Apply a transformation to the mesh.
+  CeedScalar exact_volume = TransformMeshCoords(dim, mesh_size, mesh_coords);
+
+  // Context data to be passed to the 'build_mass' QFunction.
+  CeedQFunctionContext build_ctx;
+  struct BuildContext  build_ctx_data;
+
+  build_ctx_data.dim = build_ctx_data.space_dim = dim;
+  CeedQFunctionContextCreate(ceed, &build_ctx);
+  CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
+
+  // Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data.
+  CeedQFunction qf_build;
+
+  if (gallery) {
+    // This creates the QFunction via the gallery.
+    char name[13] = "";
+    snprintf(name, sizeof name, "Mass%" CeedInt_FMT "DBuild", dim);
+    CeedQFunctionCreateInteriorByName(ceed, name, &qf_build);
+  } else {
+    // This creates the QFunction directly.
+    CeedQFunctionCreateInterior(ceed, 1, build_mass, build_mass_loc, &qf_build);
+    CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_build, "qdata", 1, CEED_EVAL_NONE);
+    CeedQFunctionSetContext(qf_build, build_ctx);
+  }
+
+  // Create the operator that builds the quadrature data for the mass operator.
+  CeedOperator op_build;
+
+  CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
+  CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+  // Compute the quadrature data for the mass operator.
+  CeedVector q_data;
+  CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
+  CeedInt    num_elem  = 1;
+
+  for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
+  CeedVectorCreate(ceed, num_elem * elem_qpts, &q_data);
+  CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Create the QFunction that defines the action of the mass operator.
+  CeedQFunction qf_apply;
+
+  if (gallery) {
+    // This creates the QFunction via the gallery.
+    CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply);
+  } else {
+    // This creates the QFunction directly.
+    CeedQFunctionCreateInterior(ceed, 1, apply_mass, apply_mass_loc, &qf_apply);
+    CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP);
+    CeedQFunctionAddInput(qf_apply, "qdata", 1, CEED_EVAL_NONE);
+    CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP);
+  }
+
+  // Create the mass operator.
+  CeedOperator op_apply;
+
+  CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
+  CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+
+  // Create auxiliary solution-size vectors.
+  CeedVector u, v;
+
+  CeedVectorCreate(ceed, sol_size, &u);
+  CeedVectorCreate(ceed, sol_size, &v);
+
+  // Initialize 'u' with ones.
+  CeedVectorSetValue(u, 1.0);
+
+  // Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1
+  CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
+  // Compute and print the sum of the entries of 'v' giving the mesh volume.
+  CeedScalar volume = 0.;
+
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+  if (!test) {
+    // LCOV_EXCL_START
+    printf(" done.\n");
+    printf("Exact mesh volume    : % .14g\n", exact_volume);
+    printf("Computed mesh volume : % .14g\n", volume);
+    printf("Volume error         : % .14g\n", volume - exact_volume);
+    // LCOV_EXCL_STOP
+  } else {
+    CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5);
+
+    if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume);
+  }
+
+  // Free dynamically allocated memory.
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&mesh_coords);
+  CeedOperatorDestroy(&op_apply);
+  CeedQFunctionDestroy(&qf_apply);
+  CeedQFunctionContextDestroy(&build_ctx);
+  CeedOperatorDestroy(&op_build);
+  CeedQFunctionDestroy(&qf_build);
+  CeedElemRestrictionDestroy(&sol_restriction);
+  CeedElemRestrictionDestroy(&mesh_restriction);
+  CeedElemRestrictionDestroy(&q_data_restriction);
+  CeedBasisDestroy(&sol_basis);
+  CeedBasisDestroy(&mesh_basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
+
+int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]) {
+  // Use the approximate formula:
+  //    prob_size ~ num_elem * degree^dim
+  CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
+  CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
+  while (num_elem > 1) {
+    num_elem /= 2;
+    s++;
+  }
+  CeedInt r = s % dim;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    CeedInt sd = s / dim;
+
+    if (r > 0) {
+      sd++;
+      r--;
+    }
+    num_xyz[d] = 1 << sd;
+  }
+  return 0;
+}
+
+int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                              CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction) {
+  CeedInt p         = degree + 1;
+  CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
+  CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
+  CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    num_elem *= num_xyz[d];
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  *size = scalar_size * num_comp;
+  // elem:         0             1                 n-1
+  //           |---*-...-*---|---*-...-*---|- ... -|--...--|
+  // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
+  CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      e_xyz[d] = re % num_xyz[d];
+      re /= num_xyz[d];
+    }
+    CeedInt *local_elem_nodes = elem_nodes + e * num_nodes;
+
+    for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
+      CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
+      for (CeedInt d = 0; d < dim; d++) {
+        g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
+        g_nodes_stride *= nd[d];
+        r_nodes /= p;
+      }
+      local_elem_nodes[l_nodes] = g_nodes;
+    }
+  }
+  CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes,
+                            restriction);
+  if (q_data_restriction) {
+    CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction);
+  }
+  free(elem_nodes);
+  return 0;
+}
+
+int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) {
+  CeedInt p = mesh_degree + 1;
+  CeedInt nd[3], scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  CeedScalar *coords;
+
+  CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
+  CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
+  // The H1 basis uses Lobatto quadrature points as nodes.
+  CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
+  for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
+  for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
+    CeedInt r_nodes = gs_nodes;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      CeedInt d_1d = r_nodes % nd[d];
+
+      coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d];
+      r_nodes /= nd[d];
+    }
+  }
+  free(nodes);
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return 0;
+}
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#define M_PI_2 1.57079632679489661923
+#endif
+
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) {
+  CeedScalar  exact_volume;
+  CeedScalar *coords;
+
+  CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords);
+  if (dim == 1) {
+    for (CeedInt i = 0; i < mesh_size; i++) {
+      // map [0,1] to [0,1] varying the mesh density
+      coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5));
+    }
+    exact_volume = 1.;
+  } else {
+    CeedInt num_nodes = mesh_size / dim;
+
+    for (CeedInt i = 0; i < num_nodes; i++) {
+      // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+      // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+      CeedScalar u = coords[i], v = coords[i + num_nodes];
+
+      u                     = 1. + u;
+      v                     = M_PI_2 * v;
+      coords[i]             = u * cos(v);
+      coords[i + num_nodes] = u * sin(v);
+    }
+    exact_volume = 3. / 4. * M_PI;
+  }
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return exact_volume;
+}
diff --git a/examples/rust-qfunctions/ex1-volume.h b/examples/rust-qfunctions/ex1-volume.h
new file mode 100644
index 0000000000..e769823dfc
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+/// A structure used to pass additional data to f_build_mass
+struct BuildContext {
+  CeedInt dim, space_dim;
+};
+
+// References the rust file for the qfunction named build_mass_rs
+CEED_QFUNCTION_RUST(build_mass)
+
+// References the rust file for the qfunction named apply_mass_rs
+CEED_QFUNCTION_RUST(apply_mass)
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 1b6875a6ca..0fa7de80d2 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -98,7 +98,10 @@ struct Ceed_private {
   int          obj_delegate_count;
   Ceed         op_fallback_ceed;
   char       **jit_source_roots;
+  char       **rust_source_roots;
+  CeedInt      num_rust_source_roots, max_rust_source_roots, num_rust_source_roots_readers;
   CeedInt      num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers;
+  bool         cuda_compile_with_clang;
   char       **jit_defines;
   CeedInt      num_jit_defines, max_jit_defines, num_jit_defines_readers;
   int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *);
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index fd861702bb..6bf1ba3bb6 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -258,7 +258,9 @@ CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec);
 CEED_EXTERN int CeedClearWorkVectors(Ceed ceed, CeedSize min_len);
 CEED_EXTERN int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb);
 CEED_EXTERN int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots);
+CEED_EXTERN int CeedGetRustSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***rust_source_roots);
 CEED_EXTERN int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots);
+CEED_EXTERN int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_roots);
 CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines);
 CEED_EXTERN int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines);
 
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 65e94ce43a..a3442629a5 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -107,6 +107,7 @@ CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy);
 CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource);
 CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
 CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
+CEED_EXTERN int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root);
 CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
 CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
 CEED_EXTERN int CeedDestroy(Ceed *ceed);
@@ -166,6 +167,9 @@ CEED_EXTERN int CeedGetVersion(int *major, int *minor, int *patch, bool *release
 CEED_EXTERN int CeedGetGitVersion(const char **git_version);
 CEED_EXTERN int CeedGetBuildConfiguration(const char **build_config);
 
+CEED_EXTERN int CeedSetIsClang(Ceed ceed, bool isClang);
+CEED_EXTERN int CeedGetIsClang(Ceed ceed, bool *isClang);
+
 CEED_EXTERN int CeedGetScalarType(CeedScalarType *scalar_type);
 
 /// String names for enum pretty printing
@@ -369,11 +373,10 @@ CEED_EXTERN int  CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed);
 CEED_EXTERN Ceed CeedQFunctionReturnCeed(CeedQFunction qf);
 CEED_EXTERN int  CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v);
 CEED_EXTERN int  CeedQFunctionDestroy(CeedQFunction *qf);
-
-CEED_EXTERN int CeedQFunctionFieldGetName(CeedQFunctionField qf_field, const char **field_name);
-CEED_EXTERN int CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size);
-CEED_EXTERN int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode);
-CEED_EXTERN int CeedQFunctionFieldGetData(CeedQFunctionField qf_field, const char **field_name, CeedInt *size, CeedEvalMode *eval_mode);
+CEED_EXTERN int  CeedQFunctionFieldGetName(CeedQFunctionField qf_field, const char **field_name);
+CEED_EXTERN int  CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size);
+CEED_EXTERN int  CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode);
+CEED_EXTERN int  CeedQFunctionFieldGetData(CeedQFunctionField qf_field, const char **field_name, CeedInt *size, CeedEvalMode *eval_mode);
 
 /** Handle for the user provided @ref CeedQFunctionContextDestroy() callback function
 
diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h
index baa15a1e85..e592b31689 100644
--- a/include/ceed/jit-source/cuda/cuda-jit.h
+++ b/include/ceed/jit-source/cuda/cuda-jit.h
@@ -13,4 +13,8 @@
 #define CeedPragmaSIMD
 #define CEED_Q_VLA 1
 
+#define CEED_QFUNCTION_RUST(name)                                                                                       \
+  extern "C" __device__ int name##_rs(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out); \
+  static __device__ int name(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return name##_rs(ctx, Q, in, out); }
+
 #include "cuda-types.h"
diff --git a/include/ceed/types.h b/include/ceed/types.h
index 2739390d8e..098afd5987 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -51,6 +51,33 @@ backends. It also creates a variable `name_loc` populated with the correct sourc
   CEED_QFUNCTION_ATTR static int name
 #endif
 
+/**
+    @ingroup CeedQFunction
+This macro populates the correct function for Rust-based User QFunction source for code generation backends or populates default values for CPU backends. It also creates a variable `name_loc` populated with the correct source path for creating the respective User QFunction. Note that the function, as named in rust, must be called `name_rs`. When referencing it in C, use just `name` (no `_rs`)
+Example:
+//ex1-volume.h
+CEED_QFUNCTION_RUST(build_mass)
+//ex1-volume.c
+CeedAddRustSourceRoot(ceed, "examples/ceed/ex1-volume-rs");
+// ex1-volume-rs/src/lib.rs
+#[no_mangle]
+pub unsafe extern "C" fn build_mass_rs(
+    ctx: *mut c_void,
+    Q: i32,
+    in: *const *const f64,
+    out: *mut *mut f64,
+) -> i8
+**/
+#ifndef CEED_QFUNCTION_RUST
+#define CEED_QFUNCTION_RUST(name)                                                                                            \
+  CEED_QFUNCTION_ATTR int        name##_rs(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out); \
+  CEED_QFUNCTION_ATTR static int name(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {     \
+    return name##_rs(ctx, Q, in, out);                                                                                       \
+  }                                                                                                                          \
+  static const char name##_loc[] = __FILE__ ":" #name;
+#endif
+// Note: placing the _loc of the function below the function in the macro is required because python cffi will exclude the previous line (the }) based on the backslash at the end of it, which is required for our python build script to exclude macros. See /python/build_ceed_cffi.py for more details
+
 /**
   @ingroup CeedQFunction
   This macro populates the correct function annotations for User QFunction helper function source for code generation backends or populates default
diff --git a/interface/ceed-config.c b/interface/ceed-config.c
index bb7aaa9258..3c837167b7 100644
--- a/interface/ceed-config.c
+++ b/interface/ceed-config.c
@@ -35,6 +35,38 @@ int CeedGetGitVersion(const char **git_version) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set whether or not to use clang when compiling for GPU (instead of nvrtc)
+
+  @param[in] is_clang Whether or not to use clang on GPU
+
+  @ref Developer
+
+  @sa CeedGetIsClang()
+
+  @return An error code: 0 - success, otherwise - failure
+ */
+int CeedSetIsClang(Ceed ceed, bool is_clang) {
+  ceed->cuda_compile_with_clang = is_clang;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Determine if the current ceed is set to compile with clang when on GPU
+
+  @param[out] is_clang The location to write the current GPU clang status to
+
+  @ref Developer
+
+  @sa CeedSetIsClang()
+
+  @return An error code: 0 - success, otherwise - failure
+ */
+int CeedGetIsClang(Ceed ceed, bool *is_clang) {
+  *is_clang = ceed->cuda_compile_with_clang;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get build variables as a multi-line string.
 
diff --git a/interface/ceed.c b/interface/ceed.c
index b4bf09883a..4d8a08279b 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -959,6 +959,30 @@ int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***ji
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Retrieve list of additional Rust source roots from `Ceed` context.
+
+  Note: The caller is responsible for restoring `rust_source_roots` with @ref CeedRestoreRustSourceRoots().
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] num_source_roots Number of JiT source directories
+  @param[out] rust_source_roots Absolute paths to additional Rust source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetRustSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***rust_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *num_source_roots  = ceed_parent->num_rust_source_roots;
+  *rust_source_roots = (const char **)ceed_parent->rust_source_roots;
+  ceed_parent->num_rust_source_roots_readers++;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Restore list of additional JiT source roots from with @ref CeedGetJitSourceRoots()
 
@@ -979,6 +1003,26 @@ int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Restore list of additional Rust source roots from with @ref CeedGetJitSourceRoots()
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] rust_source_roots Absolute paths to additional Rust source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *rust_source_roots = NULL;
+  ceed_parent->num_rust_source_roots_readers--;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Retrieve list of additional JiT defines from `Ceed` context.
 
@@ -1166,6 +1210,7 @@ int CeedInit(const char *resource, Ceed *ceed) {
   // Setup Ceed
   CeedCall(CeedCalloc(1, ceed));
   CeedCall(CeedCalloc(1, &(*ceed)->jit_source_roots));
+  CeedCall(CeedCalloc(1, &(*ceed)->rust_source_roots));
   const char *ceed_error_handler = getenv("CEED_ERROR_HANDLER");
   if (!ceed_error_handler) ceed_error_handler = "abort";
   if (!strcmp(ceed_error_handler, "exit")) (*ceed)->Error = CeedErrorExit;
@@ -1278,6 +1323,16 @@ int CeedInit(const char *resource, Ceed *ceed) {
   // Note: there will always be the default root for every Ceed but all additional paths are added to the top-most parent
   CeedCall(CeedAddJitSourceRoot(*ceed, (char *)CeedJitSourceRootDefault));
 
+  // By default, make cuda compile without clang, use nvrtc instead
+  // Note that this is overridden if a rust file is included (rust requires clang)
+  const char *env = getenv("GPU_CLANG");
+
+  if (env && strcmp(env, "1") == 0) {
+    (*ceed)->cuda_compile_with_clang = true;
+  } else {
+    (*ceed)->cuda_compile_with_clang = false;
+  }
+
   // Backend specific setup
   CeedCall(backends[match_index].init(&resource[match_help], *ceed));
   return CEED_ERROR_SUCCESS;
@@ -1418,6 +1473,39 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set additional Rust source root for `Ceed` context for use in QFunction
+
+  @param[in,out] ceed            `Ceed` context
+  @param[in]     rust_source_root Absolute path to additional Rust source directory
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  CeedCheck(!ceed_parent->num_rust_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add Rust source root, read access has not been restored");
+
+  CeedInt index       = ceed_parent->num_rust_source_roots;
+  size_t  path_length = strlen(rust_source_root);
+
+  if (ceed_parent->num_rust_source_roots == ceed_parent->max_rust_source_roots) {
+    if (ceed_parent->max_rust_source_roots == 0) ceed_parent->max_rust_source_roots = 1;
+    ceed_parent->max_rust_source_roots *= 2;
+    CeedCall(CeedRealloc(ceed_parent->max_rust_source_roots, &ceed_parent->rust_source_roots));
+  }
+  CeedCall(CeedCalloc(path_length + 1, &ceed_parent->rust_source_roots[index]));
+  memcpy(ceed_parent->rust_source_roots[index], rust_source_root, path_length);
+  ceed_parent->num_rust_source_roots++;
+  ceed_parent->cuda_compile_with_clang = true;
+  ceed->cuda_compile_with_clang        = true;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Set additional JiT compiler define for `Ceed` context
 
diff --git a/python/build_ceed_cffi.py b/python/build_ceed_cffi.py
index a802357303..5ecba579af 100644
--- a/python/build_ceed_cffi.py
+++ b/python/build_ceed_cffi.py
@@ -13,6 +13,29 @@
 ceed_version_ge = re.compile(r'\s+\(!?CEED_VERSION.*')
 
 
+# Checks to see if a c line is part of the lines we have to exclude (macros)
+def is_valid_line(line):
+    if (line.startswith("#") and not line.startswith("#include")):
+        return False
+    if (line.startswith("#include \"deprecated.h\"")):
+        return False
+    if (line.startswith("  CEED_QFUNCTION_ATTR")):
+        return False
+    if (line.startswith("  static const char")):
+        return False
+    if (line.endswith('\\\n')):
+        return False
+    if ("CeedErrorImpl" in line):
+        return False
+    if (r'const char *, ...);' in line):
+        return False
+    if (line.startswith("CEED_EXTERN const char *const")):
+        return False
+    if (ceed_version_ge.match(line)):
+        return False
+    return True
+
+
 def get_ceed_dirs():
     here = os.path.dirname(os.path.abspath(__file__))
     prefix = os.path.dirname(here)
@@ -31,15 +54,7 @@ def get_ceed_dirs():
 lines = []
 for header_path in ["include/ceed/types.h", "include/ceed/ceed.h"]:
     with open(os.path.abspath(header_path)) as f:
-        lines += [line.strip() for line in f if
-                  not (line.startswith("#") and not line.startswith("#include")) and
-                  not line.startswith("  static") and
-                  not line.startswith("#include \"deprecated.h\"") and
-                  not line.startswith("  CEED_QFUNCTION_ATTR") and
-                  "CeedErrorImpl" not in line and
-                  "const char *, ...);" not in line and
-                  not line.startswith("CEED_EXTERN const char *const") and
-                  not ceed_version_ge.match(line)]
+        lines += [line.strip() for line in f if is_valid_line(line)]
 lines = [line.replace("CEED_EXTERN", "extern") for line in lines]
 
 # Find scalar type inclusion line and insert definitions
diff --git a/tests/junit.py b/tests/junit.py
index b0144454af..7237594b17 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -51,6 +51,8 @@ def get_source_path(self, test: str) -> Path:
             Path: Path to source file
         """
         prefix, rest = test.split('-', 1)
+        if prefix == 'rustqfunctions':
+            return (Path('examples') / 'rust-qfunctions' / rest).with_suffix('.c')
         if prefix == 'petsc':
             return (Path('examples') / 'petsc' / rest).with_suffix('.c')
         elif prefix == 'mfem':

From 898eb931305e6255a892582eacaf7552197fb10f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 31 Jul 2025 12:59:09 -0600
Subject: [PATCH 481/571] ceed - clean up rust source roots on destroy

---
 interface/ceed.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/interface/ceed.c b/interface/ceed.c
index 4d8a08279b..0952389a00 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1601,6 +1601,11 @@ int CeedDestroy(Ceed *ceed) {
   }
   CeedCall(CeedFree(&(*ceed)->jit_defines));
 
+  for (CeedInt i = 0; i < (*ceed)->num_rust_source_roots; i++) {
+    CeedCall(CeedFree(&(*ceed)->rust_source_roots[i]));
+  }
+  CeedCall(CeedFree(&(*ceed)->rust_source_roots));
+
   CeedCall(CeedFree(&(*ceed)->f_offsets));
   CeedCall(CeedFree(&(*ceed)->resource));
   CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));

From 270160121299d08160b35f75985c4d6c80009118 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 31 Jul 2025 13:09:45 -0600
Subject: [PATCH 482/571] ci - make sure rust toolchain has everything needed

---
 .gitlab-ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9a7b3adfc5..03f0ccb8ef 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -193,6 +193,8 @@ noether-rust-qfunctions:
     - echo "-------------- GCOV ----------------" && gcov --version
   script:
     - rm -f .SUCCESS
+    # Rustup
+    - rustup component add rust-src --toolchain nightly-x86_64-unknown-linux-gnu
     # libCEED
     - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - echo "-------------- libCEED -------------" && make info

From 6ca7e175262dc74f45d0be70e66c7f4ced8e510b Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 31 Jul 2025 13:16:51 -0600
Subject: [PATCH 483/571] make - only test Rust QFunctions when requested

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8a097ed5c6..f8792bfb95 100644
--- a/Makefile
+++ b/Makefile
@@ -787,7 +787,7 @@ external_examples := \
 	$(if $(DEAL_II_DIR),$(dealiiexamples)) \
 	$(if $(PETSC_DIR),$(fluidsexamples)) \
 	$(if $(PETSC_DIR),$(solidsexamples)) \
-	$(rustqfunctionsexamples)
+	$(if $(or $(RUST_QF),$(GPU_CLANG)),$(rustqfunctionsexamples))
 
 allexamples = $(examples) $(external_examples)
 

From 722068580ca339c7e06327342f6cccc6af62d2ef Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 31 Jul 2025 13:40:56 -0600
Subject: [PATCH 484/571] ci - fix search=rustqfunction

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 03f0ccb8ef..cc469666e1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -201,10 +201,10 @@ noether-rust-qfunctions:
     - make clean
     - make -k -j$NPROC_CPU -l$NPROC_CPU
     # -- libCEED only tests
-    - echo "-------------- Rust QFunction tests -"
+    - echo "-------------- Rust QFunction tests -----"
     #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="rust-qfunction" junit realsearch=rust-qfunction
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="rust-qfunction" junit search=rustqfunction
     # Report status
     - touch .SUCCESS
   after_script:

From 9b5f41c81b637db3e5453a22df59a4f47deed499 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 31 Jul 2025 14:36:18 -0600
Subject: [PATCH 485/571] clang - keep temp files from overwriting each other

---
 .gitignore                          |  2 +-
 Makefile                            |  2 +-
 backends/cuda/ceed-cuda-compile.cpp | 60 +++++++++++++++++++----------
 examples/ceed/.gitignore            |  1 +
 examples/rust-qfunctions/.gitignore |  3 +-
 examples/rust-qfunctions/Makefile   |  2 +-
 6 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8a036aadc5..6baed96b94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,7 +52,7 @@ doc/sphinx/build/
 doc/sphinx/source/examples/
 
 # Clang GPU temp files
-temp_*
+temp/*
 
 # Output files, videos, and compressed archives should not be added accidentally
 *.avi
diff --git a/Makefile b/Makefile
index f8792bfb95..1c95362776 100644
--- a/Makefile
+++ b/Makefile
@@ -916,7 +916,7 @@ cln clean :
 	$(call quiet,MAKE) -C examples clean NEK5K_DIR="$(abspath $(NEK5K_DIR))"
 	$(call quiet,MAKE) -C python/tests clean
 	$(RM) benchmarks/*output.txt
-	$(RM) -f temp_*
+	$(RM) -rf temp
 
 distclean : clean
 	$(RM) -r doc/html doc/sphinx/build $(CONFIG)
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index c73a988a55..be60a45ee5 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -15,6 +15,7 @@
 #include <nvrtc.h>
 #include <stdarg.h>
 #include <string.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 
 #include <cstdlib>
@@ -54,7 +55,7 @@ static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *messa
   }
   CeedDebug(ceed, "Command output:\n%s\n", output);
 
-  CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s with error: %s", message, output);
+  CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s with command: %s\nand error: %s", message, command, output);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -154,8 +155,6 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
   // Add string source argument provided in call
   code << source;
 
-  // Create Program
-
   // Compile kernel
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
@@ -221,15 +220,31 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     CeedCallBackend(CeedFree(&ptx));
     return CEED_ERROR_SUCCESS;
   } else {
-    const char *full_filename = "temp_kernel_source.cu";
-    FILE       *file          = fopen(full_filename, "w");
+    srand(time(NULL));
+    const int build_id = rand();
 
-    CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang\n");
-    fputs(code.str().c_str(), file);
-    fclose(file);
+    // Create temp dir if needed
+    {
+      DIR *dir = opendir("temp");
 
-    // Get rust crate directories
+      if (dir) {
+        closedir(dir);
+      } else {
+        mkdir("temp", 0777);
+        chmod("temp", 0777);
+      }
+    }
+    // Write code to temp file
+    {
+      std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu");
+      FILE       *file     = fopen(filename.c_str(), "w");
+
+      CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang");
+      fputs(code.str().c_str(), file);
+      fclose(file);
+    }
 
+    // Get rust crate directories
     const char **rust_source_dirs     = nullptr;
     int          num_rust_source_dirs = 0;
 
@@ -265,14 +280,17 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
 
     // Compile wrapper kernel
     command = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
-              " --cuda-device-only -emit-llvm -S temp_kernel_source.cu -o temp_kernel.ll ";
+              " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) + "_0_source.cu -o temp/kernel_" +
+              std::to_string(build_id) + "_1_wrapped.ll ";
     command += opts[4];
     CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
 
     // the find command finds the rust-installed llvm-link tool and runs it
-    command = "$(find $(rustup run " + std::string(rust_toolchain) +
-              " rustc --print sysroot) -name llvm-link) temp_kernel.ll --ignore-non-bitcode --internalize --only-needed -S -o "
-              "temp_kernel_linked.ll  ";
+    command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
+              std::to_string(build_id) +
+              "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
+              "temp/kernel_" +
+              std::to_string(build_id) + "_2_linked.ll ";
 
     // Searches for .a files in rust directoy
     // Note: this is necessary because rust crate names may not match the folder they are in
@@ -298,18 +316,20 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     // Link, optimize, and compile final CUDA kernel
     // note that the find command is used to find the rust-installed llvm tool
     CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
-    CeedCallSystem(ceed,
-                   ("$(find $(rustup run " + std::string(rust_toolchain) +
-                    " rustc --print sysroot) -name opt) --passes internalize,inline temp_kernel_linked.ll -o temp_kernel_opt.bc")
-                       .c_str(),
-                   "optimize linked C and Rust source");
+    CeedCallSystem(
+        ceed,
+        ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" +
+         std::to_string(build_id) + "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
+            .c_str(),
+        "optimize linked C and Rust source");
     CeedCallSystem(ceed,
                    ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
-                    std::to_string(prop.major) + std::to_string(prop.minor) + " temp_kernel_opt.bc -o temp_kernel_final.ptx")
+                    std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
+                    "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
                        .c_str(),
                    "compile final CUDA kernel");
 
-    ifstream      ptxfile("temp_kernel_final.ptx");
+    ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
     ostringstream sstr;
 
     sstr << ptxfile.rdbuf();
diff --git a/examples/ceed/.gitignore b/examples/ceed/.gitignore
index 9f00fb96a8..9250d2275b 100644
--- a/examples/ceed/.gitignore
+++ b/examples/ceed/.gitignore
@@ -1,2 +1,3 @@
 ex1-volume
 ex2-surface
+ex3-volume
diff --git a/examples/rust-qfunctions/.gitignore b/examples/rust-qfunctions/.gitignore
index 7c891764ce..f2ceaf60f1 100644
--- a/examples/rust-qfunctions/.gitignore
+++ b/examples/rust-qfunctions/.gitignore
@@ -1 +1,2 @@
-temp_*
+ex1-volume
+temp/*
diff --git a/examples/rust-qfunctions/Makefile b/examples/rust-qfunctions/Makefile
index 324297c5d9..92f0915cbf 100644
--- a/examples/rust-qfunctions/Makefile
+++ b/examples/rust-qfunctions/Makefile
@@ -31,5 +31,5 @@ ex1-volume: ex1-volume.c
 
 clean:
 	rm -f *~ $(EXAMPLES)
-	rm -f temp_*
+	rm -rf temp/
 	rm -rf *.dSYM *.TVD.*breakpoints

From 33cc410d9e7c7c1ec2b31835f296422ec750fa8c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 1 Aug 2025 09:58:16 -0600
Subject: [PATCH 486/571] clang - always set permissions

---
 backends/cuda/ceed-cuda-compile.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index be60a45ee5..89cc4ca3ca 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -230,6 +230,8 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
       if (dir) {
         closedir(dir);
       } else {
+        // In parallel multiple processes may attempt
+        // Only one process needs to succeed
         mkdir("temp", 0777);
         chmod("temp", 0777);
       }
@@ -284,6 +286,7 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
               std::to_string(build_id) + "_1_wrapped.ll ";
     command += opts[4];
     CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
+    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");
 
     // the find command finds the rust-installed llvm-link tool and runs it
     command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
@@ -322,12 +325,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
          std::to_string(build_id) + "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
             .c_str(),
         "optimize linked C and Rust source");
+    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions");
     CeedCallSystem(ceed,
                    ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
                     std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
                     "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
                        .c_str(),
                    "compile final CUDA kernel");
+    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions");
 
     ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
     ostringstream sstr;

From 7fed1585870094bd36e2c0cdb6ad3575c8ecfc13 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 1 Aug 2025 10:31:16 -0600
Subject: [PATCH 487/571] ci - make sure rustup is up to date

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cc469666e1..44c52c2028 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -194,6 +194,7 @@ noether-rust-qfunctions:
   script:
     - rm -f .SUCCESS
     # Rustup
+    - rustup update nightly
     - rustup component add rust-src --toolchain nightly-x86_64-unknown-linux-gnu
     # libCEED
     - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9

From fa619ecc52f58ebd3ff3ef012ebe7a24b3c56483 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 1 Aug 2025 12:10:56 -0600
Subject: [PATCH 488/571] clang - use clang++-version if able

---
 backends/cuda/ceed-cuda-common.h    |  2 +
 backends/cuda/ceed-cuda-compile.cpp | 65 ++++++++++++++++++++++-------
 2 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/backends/cuda/ceed-cuda-common.h b/backends/cuda/ceed-cuda-common.h
index 0236e57cdc..b7ec686424 100644
--- a/backends/cuda/ceed-cuda-common.h
+++ b/backends/cuda/ceed-cuda-common.h
@@ -66,6 +66,8 @@ static const char *cublasGetErrorName(cublasStatus_t error) {
 
 typedef struct {
   int                   device_id;
+  bool                  use_llvm_version;
+  int                   llvm_version;
   cublasHandle_t        cublas_handle;
   struct cudaDeviceProp device_prop;
 } Ceed_Cuda;
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 89cc4ca3ca..47964dcc98 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -14,6 +14,7 @@
 #include <dirent.h>
 #include <nvrtc.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -44,18 +45,19 @@
 // Call system command and capture stdout + stderr
 //------------------------------------------------------------------------------
 static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
-  CeedDebug(ceed, "Running command:\n$ %s\n", command);
+  CeedDebug(ceed, "Running command:\n$ %s", command);
   FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
 
-  CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s with command: %s", message, command);
+  CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s", message, command);
 
-  char output[4 * CEED_MAX_RESOURCE_LEN];
+  char        line[CEED_MAX_RESOURCE_LEN] = "";
+  std::string output                      = "";
 
-  while (fgets(output, sizeof(output), output_stream) != nullptr) {
+  while (fgets(line, sizeof(line), output_stream) != nullptr) {
+    output += line;
   }
-  CeedDebug(ceed, "Command output:\n%s\n", output);
-
-  CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s with command: %s\nand error: %s", message, command, output);
+  CeedDebug(ceed, "output:\n%s\n", output.c_str());
+  CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s\nerror:\n%s", message, command, output.c_str());
   return CEED_ERROR_SUCCESS;
 }
 
@@ -280,15 +282,51 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
       CeedCallSystem(ceed, command.c_str(), "build Rust crate");
     }
 
+    // Get Clang version
+    bool use_llvm_version = ceed_data->use_llvm_version;
+    int  llvm_version     = ceed_data->llvm_version;
+
+    if (llvm_version == 0) {
+      command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) --version";
+      CeedDebug(ceed, "Attempting to detect Rust LLVM version.\ncommand:\n$ %s", command.c_str());
+      FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
+
+      CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version");
+
+      char        line[CEED_MAX_RESOURCE_LEN] = "";
+      std::string output                      = "";
+
+      while (fgets(line, sizeof(line), output_stream) != nullptr) {
+        output += line;
+      }
+      CeedDebug(ceed, "output:\n%s", output.c_str());
+      CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version\ncommand:\n$ %s\nerror:\n%s",
+                command.c_str(), output.c_str());
+
+      const char *version_substring = strstr(output.c_str(), "LLVM version ");
+
+      version_substring += 13;
+
+      char *next_dot = strchr((char *)version_substring, '.');
+
+      next_dot[0]             = '\0';
+      ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
+      CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);
+
+      command                     = std::string("clang++-") + std::to_string(llvm_version);
+      output_stream               = popen((command + std::string(" 2>&1")).c_str(), "r");
+      ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
+    }
+
     // Compile wrapper kernel
-    command = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
-              " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) + "_0_source.cu -o temp/kernel_" +
-              std::to_string(build_id) + "_1_wrapped.ll ";
+    command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" +
+              std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) +
+              "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll ";
     command += opts[4];
     CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
     CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");
 
-    // the find command finds the rust-installed llvm-link tool and runs it
+    // Find Rust's llvm-link tool and runs it
     command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
               std::to_string(build_id) +
               "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
@@ -296,7 +334,7 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
               std::to_string(build_id) + "_2_linked.ll ";
 
     // Searches for .a files in rust directoy
-    // Note: this is necessary because rust crate names may not match the folder they are in
+    // Note: this is necessary because Rust crate names may not match the folder they are in
     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
       std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
       DIR        *dp  = opendir(dir.c_str());
@@ -304,7 +342,7 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
       CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
       struct dirent *entry;
 
-      // finds files ending in .a
+      // Find files ending in .a
       while ((entry = readdir(dp)) != nullptr) {
         std::string filename(entry->d_name);
 
@@ -317,7 +355,6 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     }
 
     // Link, optimize, and compile final CUDA kernel
-    // note that the find command is used to find the rust-installed llvm tool
     CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
     CeedCallSystem(
         ceed,

From bdd929d2535f59f3fa3773fee8d7c44fbe1c8ed5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 1 Aug 2025 13:05:48 -0600
Subject: [PATCH 489/571] ci - ensure full rust tools installed

---
 .gitlab-ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 44c52c2028..5179597c67 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -195,7 +195,8 @@ noether-rust-qfunctions:
     - rm -f .SUCCESS
     # Rustup
     - rustup update nightly
-    - rustup component add rust-src --toolchain nightly-x86_64-unknown-linux-gnu
+    - rustup component add rust-src --toolchain nightly
+    - rustup component add llvm-tools --toolchain nightly
     # libCEED
     - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - echo "-------------- libCEED -------------" && make info

From b02756aa7c7857cb31158f7e47cf7642283031c8 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 1 Aug 2025 14:50:22 -0600
Subject: [PATCH 490/571] ci - rust job NPROC_POOL=1

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5179597c67..9a994f963b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -182,7 +182,7 @@ noether-rust-qfunctions:
   before_script:
     # Environment
     - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc GPU_CLANG=1
-    - export NPROC_POOL=4
+    - export NPROC_POOL=1
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
     - echo "-------------- CC ------------------" && $CC --version
     - echo "-------------- CXX -----------------" && $CXX --version

From f03c7eee995104f4b96260e7ce1a69b39ad20579 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 4 Aug 2025 09:13:21 -0600
Subject: [PATCH 491/571] clang - minor tidying for CUDA JiT

---
 backends/cuda/ceed-cuda-compile.cpp | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 47964dcc98..878b64b4ba 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -309,13 +309,18 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
 
       char *next_dot = strchr((char *)version_substring, '.');
 
-      next_dot[0]             = '\0';
-      ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
-      CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);
-
-      command                     = std::string("clang++-") + std::to_string(llvm_version);
-      output_stream               = popen((command + std::string(" 2>&1")).c_str(), "r");
-      ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
+      if (next_dot) {
+        next_dot[0]             = '\0';
+        ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
+        CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);
+
+        command                     = std::string("clang++-") + std::to_string(llvm_version);
+        output_stream               = popen((command + std::string(" 2>&1")).c_str(), "r");
+        ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
+      } else {
+        ceed_data->llvm_version     = -1;
+        ceed_data->use_llvm_version = use_llvm_version = false;
+      }
     }
 
     // Compile wrapper kernel
@@ -326,15 +331,16 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
     CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");
 
-    // Find Rust's llvm-link tool and runs it
+    // Find Rust's llvm-link tool and run it
     command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
               std::to_string(build_id) +
               "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
               "temp/kernel_" +
               std::to_string(build_id) + "_2_linked.ll ";
 
-    // Searches for .a files in rust directoy
-    // Note: this is necessary because Rust crate names may not match the folder they are in
+    // Searches for .a files in Rust directory
+    // Note: Rust crate names may not match the folder they are in
+    // TODO: If libCEED switches to c++17, use std::filesystem here
     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
       std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
       DIR        *dp  = opendir(dir.c_str());
@@ -351,7 +357,6 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
         }
       }
       closedir(dp);
-      // TODO: when libCEED switches to c++17, switch to std::filesystem for the loop above
     }
 
     // Link, optimize, and compile final CUDA kernel
@@ -371,6 +376,7 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
                    "compile final CUDA kernel");
     CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions");
 
+    // Load module from final PTX
     ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
     ostringstream sstr;
 

From de5bf46bcc909da70680c51facd7097145cfeaa6 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 13 Aug 2025 12:05:46 -0600
Subject: [PATCH 492/571] ref - Ensure at-points operators skip cells with no
 points (#1886)

* ref - Ensure at-points operators skip cells with no points

* change conditional

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>

---------

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>
---
 backends/ref/ceed-ref-operator.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 4769d34d68..c102b8c01d 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1054,6 +1054,7 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
     // Setup points for element
     CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
 
     // Input basis apply
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,

From dd6207fc8114e0cc21a5ac2f5d1e07899e2de22d Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Thu, 14 Aug 2025 17:02:15 -0600
Subject: [PATCH 493/571] mailmap: updates, 44 contributors as of today

---
 .mailmap | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index 61c3cab2b6..787aaa2c49 100644
--- a/.mailmap
+++ b/.mailmap
@@ -6,7 +6,11 @@
 #
 # See 'git help shortlog' for details
 
+Adeleke Bankole <adeleke.bankole@colorado.edu>
+Adeleke Bankole <adeleke.bankole@colorado.edu>          <86932837+AdelekeBankole@users.noreply.github.com>
 Ahmad Abdelfattah <ahmad@icl.utk.edu>                   <36712794+abdelfattah83@users.noreply.github.com>
+Allen MacFarland <Allen.MacFarland@colorado.edu>        <79958059+SirAlienTheGreat@users.noreply.github.com>
+Alex Pedersen <ajpedersen20@gmail.com>                  <54287657+ajpedersen20@users.noreply.github.com>
 Arash Mehraban <arashm81@gmail.com>                     <ArashMehraban@users.noreply.github.com>
 David Medina <dmed256@gmail.com>
 James Wright <james@jameswright.xyz>                    <jameswright@jameswright.xyz>
@@ -16,13 +20,21 @@ Jeremy L. Thompson <jeremy@jeremylt.org>                <25011573+jeremylt@users
 Jeremy L. Thompson <jeremy@jeremylt.org>                <jeth8984@noether>
 Jeremy L. Thompson <jeremy@jeremylt.org>                <jeremy.thompson@colorado.edu>
 Jeremy L. Thompson <jeremy@jeremylt.org>                <thompson.jeremy.luke@gmail.com>
-Leila Ghaffari <Leila.Ghaffari@colorado.edu>            <49916147+LeilaGhaffari@users.noreply.github.com>
-Leila Ghaffari <Leila.Ghaffari@colorado.edu>            <leila@Leilas-MacBook-Pro.local>
+Kenneth E. Jansen <Kenneth.Jansen@colorado.edu>         <kenneth.jansen@colorado.edu>
+Layla Ghaffari <Layla.Ghaffari@colorado.edu>            <Leila.Ghaffari@colorado.edu>
+Layla Ghaffari <Layla.Ghaffari@colorado.edu>            <49916147+LeilaGhaffari@users.noreply.github.com>
+Layla Ghaffari <Layla.Ghaffari@colorado.edu>            <leila@Leilas-MacBook-Pro.local>
+Natalie Beams <nbeams@icl.utk.edu>
 Natalie Beams <nbeams@icl.utk.edu>                      <246972+nbeams@users.noreply.github.com>
 Rey Koki <rey.koki@colorado.edu>                        <36133157+reykoki@users.noreply.github.com>
 Rezgar Shakeri <Rezgar.Shakeri@colorado.edu>            <42816410+rezgarshakeri@users.noreply.github.com>
+Rezgar Shakeri <Rezgar.Shakeri@colorado.edu>            <rezgar.shakeri@colorado.edu>
+Riccardo Balin <riccardo.balin@gmail.com>               <balin@uan-0001.head.cm.americas.sgi.com>
+Riccardo Balin <riccardo.balin@gmail.com>               <balin@uan-0002.head.cm.americas.sgi.com>
+Thilina Ratnayaka <thilinarmtb@gmail.com>
 Thilina Ratnayaka <thilinarmtb@gmail.com>               <thilinarmtb@users.noreply.github.com>
 Tzanio Kolev <tzanio@llnl.gov>
+Umesh Unnikrishnan <unnikrishnan@anl.gov>               <umesh.aero@gatech.edu>
 Valeria Barra <valeriabarra21@gmail.com>
 Valeria Barra <valeriabarra21@gmail.com>                <39932030+valeriabarra@users.noreply.github.com>
 Valeria Barra <valeriabarra21@gmail.com>                <vaba3353@shas0136.rc.int.colorado.edu>
@@ -31,3 +43,6 @@ Valeria Barra <valeriabarra21@gmail.com>                <valeria.barra@colorado.
 Will Pazner <will.e.p@gmail.com>                        <11493037+pazner@users.noreply.github.com>
 Yohann Dudouit <dudouit1@llnl.gov>
 Yohann Dudouit <dudouit1@llnl.gov>                      <yohann.dudouit@gmail.com>
+Zach Atkins <Zach.Atkins@colorado.edu>                  <zach.atkins@colorado.edu>
+Zach Atkins <Zach.Atkins@colorado.edu>                  <zachary.r.atkins@pm.me>
+Zach Atkins <Zach.Atkins@colorado.edu>                  <zacharyjayhawk@gmail.com>

From ebd5faf1fbcdb19a543da89a64e575df1e7d5a2b Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Mon, 18 Aug 2025 14:12:07 -0600
Subject: [PATCH 494/571] python - remove license classifier (deprecated in
 later python versions)

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index a2bb813e59..b0d423c815 100644
--- a/setup.py
+++ b/setup.py
@@ -79,7 +79,6 @@ def make_libceed_so(self, prefix):
 Development Status :: 4 - Beta
 Intended Audience :: Developers
 Intended Audience :: Science/Research
-License :: OSI Approved :: BSD License
 Operating System :: POSIX
 Programming Language :: C
 Programming Language :: C++

From 56318ee4862bf2b045dbea0ef6799ecae8c551fe Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 19 Aug 2025 14:20:34 -0600
Subject: [PATCH 495/571] assembly - add
 `CeedOperatorLinearAssembleGetNumEntries`

---
 include/ceed/ceed.h              |  1 +
 interface/ceed-preconditioning.c | 49 ++++++++++++++++++++++++--------
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index a3442629a5..f0207d6abf 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -438,6 +438,7 @@ CEED_EXTERN int  CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, C
 CEED_EXTERN int  CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols);
 CEED_EXTERN int  CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols);
+CEED_EXTERN int  CeedOperatorLinearAssembleGetNumEntries(CeedOperator op, CeedSize *num_entries);
 CEED_EXTERN int  CeedOperatorLinearAssemble(CeedOperator op, CeedVector values);
 CEED_EXTERN int  CeedOperatorCompositeGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult);
 CEED_EXTERN int  CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse,
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index bbc031d6ce..84d3f1a1e0 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -956,6 +956,42 @@ static int CeedOperatorAssemblyCountEntriesSingle(CeedOperator op, CeedSize *num
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Count number of entries for assembled `CeedOperator`
+
+  @param[in]  op          `CeedOperator` to assemble
+  @param[out] num_entries Number of entries in assembled representation
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Utility
+**/
+int CeedOperatorLinearAssembleGetNumEntries(CeedOperator op, CeedSize *num_entries) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorCheckReady(op));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+
+  if (is_composite) {
+    CeedInt       num_suboperators;
+    CeedOperator *sub_operators;
+
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+
+    *num_entries = 0;
+    for (CeedInt k = 0; k < num_suboperators; ++k) {
+      CeedSize single_entries;
+
+      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
+      *num_entries += single_entries;
+    }
+  } else {
+    CeedCall(CeedOperatorAssemblyCountEntriesSingle(op, num_entries));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Common code for creating a multigrid coarse `CeedOperator` and level transfer `CeedOperator` for a `CeedOperator`
 
@@ -2462,18 +2498,7 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
   // Default interface implementation
 
   // Count entries and allocate rows, cols arrays
-  *num_entries = 0;
-  if (is_composite) {
-    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
-    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
-    for (CeedInt k = 0; k < num_suboperators; ++k) {
-      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
-      *num_entries += single_entries;
-    }
-  } else {
-    CeedCall(CeedOperatorAssemblyCountEntriesSingle(op, &single_entries));
-    *num_entries += single_entries;
-  }
+  CeedCall(CeedOperatorLinearAssembleGetNumEntries(op, num_entries));
   CeedCall(CeedCalloc(*num_entries, rows));
   CeedCall(CeedCalloc(*num_entries, cols));
 

From 9c944cbf21f25d9ee2d211269694a2e860ed203b Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Sat, 30 Aug 2025 15:53:02 -0600
Subject: [PATCH 496/571] at-points - ensure reference point coordinates are
 always up to date

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 8 ++++----
 backends/hip-ref/ceed-hip-ref-operator.c   | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index b278f37643..525a225340 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -848,12 +848,12 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
 
   // Get point coordinates
-  if (!impl->point_coords_elem) {
+  {
     CeedVector          point_coords = NULL;
     CeedElemRestriction rstr_points  = NULL;
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
-    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
@@ -1855,12 +1855,12 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   }
 
   // Get point coordinates
-  if (!impl->point_coords_elem) {
+  {
     CeedVector          point_coords = NULL;
     CeedElemRestriction rstr_points  = NULL;
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
-    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 8c37aba61b..77bc460152 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -846,12 +846,12 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
 
   // Get point coordinates
-  if (!impl->point_coords_elem) {
+  {
     CeedVector          point_coords = NULL;
     CeedElemRestriction rstr_points  = NULL;
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
-    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
@@ -1852,12 +1852,12 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   }
 
   // Get point coordinates
-  if (!impl->point_coords_elem) {
+  {
     CeedVector          point_coords = NULL;
     CeedElemRestriction rstr_points  = NULL;
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
-    CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
     CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));

From 68c01f397ddc8bbbc42c9edecbbe7eea80732430 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 2 Sep 2025 08:57:57 -0600
Subject: [PATCH 497/571] gpu - add state checking for at-points element
 restriction

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 16 ++++++++++++++--
 backends/cuda-ref/ceed-cuda-ref.h          |  2 +-
 backends/hip-ref/ceed-hip-ref-operator.c   | 16 ++++++++++++++--
 backends/hip-ref/ceed-hip-ref.h            |  2 +-
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 525a225340..c3cafaebca 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -854,7 +854,13 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
-    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
@@ -1861,7 +1867,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
-    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index b8cd4babd7..5010cdb885 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -133,7 +133,7 @@ typedef struct {
 
 typedef struct {
   bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
-  uint64_t                  *input_states;  // State tracking for passive inputs
+  uint64_t                  *input_states, points_state;  // State tracking for passive inputs
   CeedVector                *e_vecs_in, *e_vecs_out;
   CeedVector                *q_vecs_in, *q_vecs_out;
   CeedInt                    num_inputs, num_outputs;
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 77bc460152..0d1587c630 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -852,7 +852,13 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
-    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
@@ -1858,7 +1864,13 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
     if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
-    CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
     CeedCallBackend(CeedVectorDestroy(&point_coords));
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
   }
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 993e4601fd..24a95ec085 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -138,7 +138,7 @@ typedef struct {
 
 typedef struct {
   bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
-  uint64_t                 *input_states;  // State tracking for passive inputs
+  uint64_t                 *input_states, points_state;  // State tracking for passive inputs
   CeedVector               *e_vecs_in, *e_vecs_out;
   CeedVector               *q_vecs_in, *q_vecs_out;
   CeedInt                   num_inputs, num_outputs;

From 909e4c015f58576b1dd1a932426cda0d0f7a3b5a Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 4 Sep 2025 16:14:41 -0600
Subject: [PATCH 498/571] gpu & at-points - fix bug in output vectors for
 diagonal assembly

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 4 ++--
 backends/hip-ref/ceed-hip-ref-operator.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index c3cafaebca..9c4ffd8819 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -2055,8 +2055,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
     if (eval_mode == CEED_EVAL_NONE) {
       CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &e_vec_array));
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_vec_array));
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &e_vec_array));
     }
   }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 0d1587c630..274743d526 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -2054,8 +2054,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
     if (eval_mode == CEED_EVAL_NONE) {
       CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &e_vec_array));
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &e_vec_array));
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &e_vec_array));
     }
   }
 

From a5ef892d03a3b4cf0e22d36a5b8677e57e337bf9 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Fri, 26 Sep 2025 14:18:23 -0600
Subject: [PATCH 499/571] vec: Check length >= 0

---
 interface/ceed-vector.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 3d76acb95d..f4a5b7505d 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -159,6 +159,7 @@ int CeedVectorReference(CeedVector vec) {
   @ref User
 **/
 int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) {
+  CeedCheck(length >= 0, ceed, CEED_ERROR_UNSUPPORTED, "CeedVector must have length >= 0, recieved %" CeedSize_FMT, length);
   if (!ceed->VectorCreate) {
     Ceed delegate;
 

From 6a430a1a5d50e15f5fa69eb0a18cd933635c43c5 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Fri, 26 Sep 2025 15:09:32 -0600
Subject: [PATCH 500/571] deprecated.h: fix typo

---
 include/ceed/deprecated.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ceed/deprecated.h b/include/ceed/deprecated.h
index 4f54f5c606..ee6cc394f5 100644
--- a/include/ceed/deprecated.h
+++ b/include/ceed/deprecated.h
@@ -26,7 +26,7 @@ DEPRECATED("Use CeedOperatorCompositeGetNumSub()")
 static inline int CeedCompositeOperatorGetNumSub(CeedOperator a, CeedInt *b) { return CeedOperatorCompositeGetNumSub(a, b); }
 DEPRECATED("Use CeedOperatorCompositeGetSubList()")
 static inline int CeedCompositeOperatorGetSubList(CeedOperator a, CeedOperator **b) { return CeedOperatorCompositeGetSubList(a, b); }
-DEPRECATED("Use CeedCompositeOperatorGetSubByName()")
+DEPRECATED("Use CeedOperatorCompositeGetSubByName()")
 static inline int CeedCompositeOperatorGetSubByName(CeedOperator a, const char *b, CeedOperator *c) {
   return CeedOperatorCompositeGetSubByName(a, b, c);
 }

From 1a8516d00062e8132c3db0515cc9f5fa064f6664 Mon Sep 17 00:00:00 2001
From: James Wright <james@jameswright.xyz>
Date: Tue, 30 Sep 2025 14:18:54 -0600
Subject: [PATCH 501/571] style: Change clang-format to penalize newline after
 open ()

---
 .clang-format                                 |  1 +
 backends/blocked/ceed-blocked-operator.c      |  8 ++---
 backends/cuda-gen/ceed-cuda-gen-operator.c    | 28 ++++++++--------
 backends/cuda-ref/ceed-cuda-ref-basis.c       |  4 +--
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 32 +++++++++----------
 backends/cuda-ref/ceed-cuda-ref-restriction.c |  4 +--
 backends/cuda-shared/ceed-cuda-shared-basis.c | 32 +++++++++----------
 backends/cuda/ceed-cuda-compile.cpp           | 12 +++----
 backends/hip-gen/ceed-hip-gen-operator.c      | 20 ++++++------
 backends/hip-ref/ceed-hip-ref-basis.c         |  4 +--
 backends/hip-ref/ceed-hip-ref-operator.c      | 24 +++++++-------
 backends/hip-ref/ceed-hip-ref-restriction.c   |  4 +--
 backends/hip-shared/ceed-hip-shared-basis.c   | 20 ++++++------
 backends/memcheck/ceed-memcheck-restriction.c | 12 +++----
 backends/opt/ceed-opt-operator.c              | 12 +++----
 backends/ref/ceed-ref-basis.c                 | 10 +++---
 backends/ref/ceed-ref-operator.c              | 24 +++++++-------
 backends/ref/ceed-ref-restriction.c           |  8 ++---
 .../sycl-ref/ceed-sycl-ref-operator.sycl.cpp  |  8 ++---
 backends/sycl/ceed-sycl-compile.sycl.cpp      |  5 +--
 examples/fluids/problems/advection.c          |  8 ++---
 examples/fluids/problems/bc_freestream.c      |  8 ++---
 examples/fluids/problems/blasius.c            |  4 +--
 examples/fluids/src/differential_filter.c     |  7 ++--
 examples/fluids/src/dm_utils.c                |  4 +--
 examples/fluids/src/misc.c                    | 32 +++++++++++--------
 examples/fluids/src/setupts.c                 |  4 +--
 examples/fluids/src/turb_spanstats.c          | 14 ++++----
 .../fluids/src/velocity_gradient_projection.c |  8 ++---
 examples/petsc/bpsraw.c                       |  8 ++---
 examples/petsc/dmswarm.c                      |  8 ++---
 examples/solids/src/cl-options.c              |  4 +--
 examples/solids/src/setup-libceed.c           |  4 +--
 interface/ceed-elemrestriction.c              |  8 ++---
 interface/ceed-preconditioning.c              | 12 +++----
 35 files changed, 206 insertions(+), 199 deletions(-)

diff --git a/.clang-format b/.clang-format
index 4311596850..3e49ddce6e 100644
--- a/.clang-format
+++ b/.clang-format
@@ -8,6 +8,7 @@ AlignEscapedNewlines:         true
 AlignOperands:                Align
 AllowShortIfStatementsOnASingleLine: AllIfsAndElse
 ColumnLimit:                  150
+PenaltyBreakOpenParenthesis:  100
 ReflowComments:               false
 CommentPragmas:               'TESTARGS'
 DerivePointerAlignment:       false
diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index c698428f50..9f52f8f0da 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -476,8 +476,8 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_NONE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        CeedCallBackend(
-            CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
+                                           &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
       }
     }
 
@@ -508,8 +508,8 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
     // Active
     if (is_active) vec = out_vec;
     // Restrict
-    CeedCallBackend(
-        CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request));
+    CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec,
+                                             request));
     if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index c6791807ac..7e28525d53 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -472,8 +472,8 @@ static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, boo
     }
     CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-    CeedCallBackend(
-        CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_qfunction, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+    CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_qfunction, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good,
+                                                   opargs));
     CeedCallCuda(ceed, cudaDeviceSynchronize());
 
     // Restore input arrays
@@ -546,8 +546,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator o
     CeedOperatorAssemblyData assembly_data;
 
     CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
-    CeedCallBackend(
-        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
     if (num_active_bases_in == num_active_bases_out) {
       CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
       if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(op, &is_build_good));
@@ -640,8 +640,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator o
                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
     CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-    CeedCallBackend(
-        CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+    CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good,
+                                                   opargs));
     CeedCallCuda(ceed, cudaDeviceSynchronize());
 
     // Restore input arrays
@@ -709,8 +709,8 @@ static int CeedOperatorAssembleSingleAtPoints_Cuda_gen(CeedOperator op, CeedInt
     CeedOperatorAssemblyData assembly_data;
 
     CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
-    CeedCallBackend(
-        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
     if (num_active_bases_in == num_active_bases_out) {
       CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
       if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(op, &is_build_good));
@@ -805,8 +805,8 @@ static int CeedOperatorAssembleSingleAtPoints_Cuda_gen(CeedOperator op, CeedInt
                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
     CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-    CeedCallBackend(
-        CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));
+    CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good,
+                                                   opargs));
     CeedCallCuda(ceed, cudaDeviceSynchronize());
 
     // Restore input arrays
@@ -876,14 +876,14 @@ int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
   }
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   if (is_at_points) {
-    CeedCallBackend(
-        CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
+                                           CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen));
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Cuda_gen));
   }
   if (!is_at_points) {
     CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda_gen));
-    CeedCallBackend(
-        CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate",
+                                           CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index b28cad5ae8..565e7d13c7 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -209,8 +209,8 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
       void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(
-          CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size,
+                                         interp_args));
     } break;
     case CEED_EVAL_GRAD: {
       void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 9c4ffd8819..3ad959eb21 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -170,8 +170,8 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
           CeedInt num_points[num_elem];
 
           for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
-          CeedCallBackend(
-              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE,
+                                                 q_vecs[i]));
         } else {
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
         }
@@ -269,8 +269,8 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   impl->num_outputs = num_output_fields;
 
   // Set up infield and outfield e-vecs and q-vecs
-  CeedCallBackend(
-      CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q,
+                                               num_elem));
   CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
                                                impl->q_vecs_out, num_output_fields, Q, num_elem));
 
@@ -522,8 +522,8 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt field = impl->input_field_order[i];
 
-    CeedCallBackend(
-        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl,
+                                                   request));
     CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, false, impl));
   }
 
@@ -869,8 +869,8 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedInt field = impl->input_field_order[i];
 
-    CeedCallBackend(
-        CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl,
+                                                   request));
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem,
                                                         num_points, false, false, impl));
   }
@@ -1015,8 +1015,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
           CeedSize q_size = (CeedSize)Q * num_elem;
 
           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
+          CeedCallBackend(CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER,
+                                             &q_vec_array[field * Q * num_elem]));
         }
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
@@ -1881,8 +1881,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
-    CeedCallBackend(
-        CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false, impl));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false,
+                                                        impl));
   }
 
   // Output pointers, as necessary
@@ -2003,8 +2003,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
 
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis));
             if (impl->apply_add_basis_out[field_out]) {
-              CeedCallBackend(
-                  CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+              CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec,
+                                                        e_vec));
             } else {
               CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
             }
@@ -2087,8 +2087,8 @@ int CeedOperatorCreate_Cuda(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda));
-  CeedCallBackend(
-      CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal",
+                                         CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 30e2ee1623..0eb2924975 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -652,8 +652,8 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda));
   if (rstr_type == CEED_RESTRICTION_POINTS) {
-    CeedCallBackend(
-        CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", CeedElemRestrictionGetAtPointsElementOffset_Cuda));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset",
+                                           CeedElemRestrictionGetAtPointsElementOffset_Cuda));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index a84ccd1410..e6cb677c4b 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -80,8 +80,8 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
                                                       thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
@@ -92,8 +92,8 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
                                                       thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
         }
       }
     } break;
@@ -331,8 +331,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                       thread_1d, 1, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem,
+                                                      interp_args));
         }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
@@ -345,8 +345,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                       thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
@@ -357,8 +357,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                       thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
         }
       }
     } break;
@@ -394,8 +394,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
                                                       thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      grad_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
@@ -406,8 +406,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
           CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
                                                       thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      grad_args));
         }
       }
     } break;
@@ -637,8 +637,8 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose));
-  CeedCallBackend(
-      CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd", &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd",
+                                     &data->InterpTransposeAdd));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd));
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 878b64b4ba..5348fa9398 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -361,12 +361,12 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
 
     // Link, optimize, and compile final CUDA kernel
     CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
-    CeedCallSystem(
-        ceed,
-        ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" +
-         std::to_string(build_id) + "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
-            .c_str(),
-        "optimize linked C and Rust source");
+    CeedCallSystem(ceed,
+                   ("$(find $(rustup run " + std::string(rust_toolchain) +
+                    " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" + std::to_string(build_id) +
+                    "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
+                       .c_str(),
+                   "optimize linked C and Rust source");
     CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions");
     CeedCallSystem(ceed,
                    ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index e4c2634a66..bab3b81b79 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -163,20 +163,20 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
+    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                  is_run_good, opargs));
   } else if (data->dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
+    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                  is_run_good, opargs));
   } else if (data->dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
     CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(
-        CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
+    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                  is_run_good, opargs));
   }
 
   // Restore input arrays
@@ -522,8 +522,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op
     CeedOperatorAssemblyData assembly_data;
 
     CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
-    CeedCallBackend(
-        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
     if (num_active_bases_in == num_active_bases_out) {
       CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
       if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(op, &is_build_good));
@@ -698,8 +698,8 @@ static int CeedOperatorAssembleSingleAtPoints_Hip_gen(CeedOperator op, CeedInt o
     CeedOperatorAssemblyData assembly_data;
 
     CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
-    CeedCallBackend(
-        CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL, NULL, NULL));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
     if (num_active_bases_in == num_active_bases_out) {
       CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
       if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(op, &is_build_good));
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 69091bc381..c0c198b329 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -208,8 +208,8 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
       void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(
-          CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size,
+                                        interp_args));
     } break;
     case CEED_EVAL_GRAD: {
       void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 274743d526..1d399b1b68 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -169,8 +169,8 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
           CeedInt num_points[num_elem];
 
           for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
-          CeedCallBackend(
-              CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE,
+                                                 q_vecs[i]));
         } else {
           CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
         }
@@ -268,8 +268,8 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   impl->num_outputs = num_output_fields;
 
   // Set up infield and outfield e-vecs and q-vecs
-  CeedCallBackend(
-      CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q,
+                                              num_elem));
   CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
                                               impl->q_vecs_out, num_output_fields, Q, num_elem));
 
@@ -1012,8 +1012,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
           CeedSize q_size = (CeedSize)Q * num_elem;
 
           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
+          CeedCallBackend(CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER,
+                                             &q_vec_array[field * Q * num_elem]));
         }
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
@@ -1878,8 +1878,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
-    CeedCallBackend(
-        CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false, impl));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false,
+                                                       impl));
   }
 
   // Output pointers, as necessary
@@ -2000,8 +2000,8 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
 
             CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis));
             if (impl->apply_add_basis_out[field_out]) {
-              CeedCallBackend(
-                  CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+              CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec,
+                                                        e_vec));
             } else {
               CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
             }
@@ -2086,8 +2086,8 @@ int CeedOperatorCreate_Hip(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip));
-  CeedCallBackend(
-      CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal",
+                                         CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 54d8b13ea0..1b21b0e5cb 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -653,8 +653,8 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Hip));
   if (rstr_type == CEED_RESTRICTION_POINTS) {
-    CeedCallBackend(
-        CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset", CeedElemRestrictionGetAtPointsElementOffset_Hip));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset",
+                                           CeedElemRestrictionGetAtPointsElementOffset_Hip));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Hip));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 6201f2aff1..abbb86ab48 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -409,8 +409,8 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     interp_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = 1;
@@ -421,8 +421,8 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     interp_args));
         }
       }
     } break;
@@ -457,8 +457,8 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
                                                      thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     grad_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = 1;
@@ -469,8 +469,8 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
                                                      thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     grad_args));
         }
       }
     } break;
@@ -703,8 +703,8 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose));
-  CeedCallBackend(
-      CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd", &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd",
+                                    &data->InterpTransposeAdd));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd));
diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index 5bae298ddb..1b57862613 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -420,8 +420,8 @@ static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rst
     // Sum into for transpose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset,
+                                                                               uu, vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
@@ -460,8 +460,8 @@ static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rst
     // Overwrite for notranspose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size,
+                                                                                 v_offset, uu, vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
@@ -760,8 +760,8 @@ int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_m
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Memcheck));
   if (rstr_type == CEED_RESTRICTION_POINTS) {
-    CeedCallBackend(
-        CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Memcheck));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement",
+                                           CeedElemRestrictionApplyAtPointsInElement_Memcheck));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Memcheck));
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 0695f7f1df..03f6c28fd0 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -410,8 +410,8 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
     is_active = vec == CEED_VECTOR_ACTIVE;
     if (is_active) vec = out_vec;
     // Restrict
-    CeedCallBackend(
-        CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request));
+    CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec,
+                                                  request));
     if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
@@ -493,8 +493,8 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
   // Loop through elements
   for (CeedInt e = 0; e < num_blocks * block_size; e += block_size) {
     // Input basis apply
-    CeedCallBackend(
-        CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, in_vec, false, e_data, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, in_vec, false, e_data, impl,
+                                               request));
 
     // Q function
     if (!impl->is_identity_qf) {
@@ -624,8 +624,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
     CeedCallBackend(CeedVectorGetArray(l_vec, CEED_MEM_HOST, &l_vec_array));
 
     // Input basis apply
-    CeedCallBackend(
-        CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, NULL, true, e_data, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, NULL, true, e_data, impl,
+                                               request));
 
     // Assemble QFunction
     for (CeedInt i = 0; i < num_input_fields; i++) {
diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index 15efd97e36..c4d07f69f4 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -116,11 +116,11 @@ static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_e
           }
           pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem;
           for (CeedInt d = 0; d < dim; d++) {
-            CeedCallBackend(CeedTensorContractApply(
-                contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode,
-                (t_mode == CEED_NOTRANSPOSE && apply_add) || (t_mode == CEED_TRANSPOSE && (d == dim - 1)),
-                (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])),
-                (t_mode == CEED_NOTRANSPOSE ? &v[d * num_qpts * num_comp * num_elem] : (d == dim - 1 ? v : tmp[(d + 1) % 2]))));
+            CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode,
+                                                    (t_mode == CEED_NOTRANSPOSE && apply_add) || (t_mode == CEED_TRANSPOSE && (d == dim - 1)),
+                                                    (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])),
+                                                    (t_mode == CEED_NOTRANSPOSE ? &v[d * num_qpts * num_comp * num_elem]
+                                                                                : (d == dim - 1 ? v : tmp[(d + 1) % 2]))));
             pre /= P;
             post *= Q;
           }
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index c102b8c01d..6b306f7cba 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -427,8 +427,8 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_NONE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        CeedCallBackend(
-            CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
+                                           &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
       }
     }
 
@@ -762,8 +762,8 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         q_size = (CeedSize)max_num_points;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, 1, &max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE,
+                                               q_vecs[i]));
         CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
@@ -936,8 +936,8 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
           CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
           CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp]));
         }
-        CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i],
+                                               impl->q_vecs_in[i]));
         CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
@@ -985,8 +985,8 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
           CeedCallBackend(CeedBasisApplyAddAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i],
                                                     impl->e_vecs_out[i]));
         } else {
-          CeedCallBackend(
-              CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i],
+                                                 impl->e_vecs_out[i]));
         }
         CeedCallBackend(CeedBasisDestroy(&basis));
         break;
@@ -1594,8 +1594,8 @@ static int CeedOperatorAssembleSingleAtPoints_Ref(CeedOperator op, CeedInt offse
     CeedInt num_points, e_vec_size = 0;
 
     // Setup points for element
-    CeedCallBackend(
-        CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, CEED_REQUEST_IMMEDIATE));
+    CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem,
+                                                              CEED_REQUEST_IMMEDIATE));
     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
 
     // Input basis apply for non-active bases
@@ -1796,8 +1796,8 @@ int CeedOperatorCreateAtPoints_Ref(CeedOperator op) {
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Ref));
-  CeedCallBackend(
-      CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate",
+                                         CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Ref));
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index ce0ad60c75..2d0f3895cc 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -423,8 +423,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co
     // Sum into for transpose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu,
+                                                                          vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size,
@@ -463,8 +463,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co
     // Overwrite for notranspose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset,
+                                                                            uu, vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size,
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 9833e2a837..74b455fbdb 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -519,8 +519,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
         for (CeedInt field = 0; field < size; field++) {
           q_size = (CeedSize)Q * num_elem;
           CeedCallBackend(CeedVectorCreate(ceed_parent, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
+          CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER,
+                                             &q_vec_array[field * Q * num_elem]));
         }
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
@@ -1395,8 +1395,8 @@ int CeedOperatorCreate_Sycl(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Sycl));
-  CeedCallBackend(
-      CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl));
+  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal",
+                                            CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl));
diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp
index 0a24ec1e2a..39b7aa80b7 100644
--- a/backends/sycl/ceed-sycl-compile.sycl.cpp
+++ b/backends/sycl/ceed-sycl-compile.sycl.cpp
@@ -157,8 +157,9 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st
     return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to retrieve kernel from Level Zero module");
   }
 
-  *sycl_kernel = new sycl::kernel(sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
-      {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context));
+  *sycl_kernel = new sycl::kernel(sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>({*sycl_module, lz_kernel,
+                                                                                           sycl::ext::oneapi::level_zero::ownership::transfer},
+                                                                                          data->sycl_context));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 7047fd6b66..307aac9081 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -199,8 +199,8 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
   }
   if (wind_type == WIND_TRANSLATION && advectionic_type == ADVECTIONIC_BUBBLE_CYLINDER && wind[2] != 0.) {
     wind[2] = 0;
-    PetscCall(
-        PetscPrintf(comm, "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -advection_ic_type cylinder\n"));
+    PetscCall(PetscPrintf(comm,
+                          "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -advection_ic_type cylinder\n"));
   }
   if (stab == STAB_NONE && CtauS != 0) {
     PetscCall(PetscPrintf(comm, "Warning! Use -CtauS only with -stab su or -stab supg\n"));
@@ -298,8 +298,8 @@ PetscErrorCode PRINT_ADVECTION(User user, ProblemData problem, AppCtx app_ctx) {
         PetscCall(PetscPrintf(comm, "    Background Wind                    : %f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1]));
         break;
       case 3:
-        PetscCall(
-            PetscPrintf(comm, "    Background Wind                    : %f,%f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1], setup_ctx->wind[2]));
+        PetscCall(PetscPrintf(comm, "    Background Wind                    : %f,%f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1],
+                              setup_ctx->wind[2]));
         break;
     }
   }
diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c
index 45458af00f..d93eabf094 100644
--- a/examples/fluids/problems/bc_freestream.c
+++ b/examples/fluids/problems/bc_freestream.c
@@ -144,13 +144,13 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
   CeedScalar temperature = reference->temperature / Kelvin;
   CeedScalar recirc = 1, softplus_velocity = 1e-2;
   PetscOptionsBegin(user->comm, NULL, "Options for Outflow boundary condition", NULL);
-  PetscCall(
-      PetscOptionsEnum("-outflow_type", "Type of outflow condition", NULL, OutflowTypes, (PetscEnum)outflow_type, (PetscEnum *)&outflow_type, NULL));
+  PetscCall(PetscOptionsEnum("-outflow_type", "Type of outflow condition", NULL, OutflowTypes, (PetscEnum)outflow_type, (PetscEnum *)&outflow_type,
+                             NULL));
   PetscCall(PetscOptionsScalar("-outflow_pressure", "Pressure at outflow condition", NULL, pressure, &pressure, NULL));
   if (outflow_type == OUTFLOW_RIEMANN) {
     PetscCall(PetscOptionsScalar("-outflow_temperature", "Temperature at outflow condition", NULL, temperature, &temperature, NULL));
-    PetscCall(
-        PetscOptionsReal("-outflow_recirc", "Fraction of recirculation to allow in exterior velocity state [0,1]", NULL, recirc, &recirc, NULL));
+    PetscCall(PetscOptionsReal("-outflow_recirc", "Fraction of recirculation to allow in exterior velocity state [0,1]", NULL, recirc, &recirc,
+                               NULL));
     PetscCall(PetscOptionsReal("-outflow_softplus_velocity", "Characteristic velocity of softplus regularization", NULL, softplus_velocity,
                                &softplus_velocity, NULL));
   }
diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index aec0d1fc82..f01d67a42c 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -290,8 +290,8 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
     PetscCall(PetscOptionsScalar("-platemesh_refine_height", "Height of boundary layer mesh refinement", NULL, mesh_refine_height,
                                  &mesh_refine_height, NULL));
     PetscCall(PetscOptionsScalar("-platemesh_growth", "Geometric growth rate of boundary layer mesh", NULL, mesh_growth, &mesh_growth, NULL));
-    PetscCall(
-        PetscOptionsScalar("-platemesh_top_angle", "Geometric top_angle rate of boundary layer mesh", NULL, mesh_top_angle, &mesh_top_angle, NULL));
+    PetscCall(PetscOptionsScalar("-platemesh_top_angle", "Geometric top_angle rate of boundary layer mesh", NULL, mesh_top_angle, &mesh_top_angle,
+                                 NULL));
     PetscCall(PetscOptionsString("-platemesh_y_node_locs_path",
                                  "Path to file with y node locations. "
                                  "If empty, will use the algorithmic mesh warping.",
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index 0727a39b75..130dafa876 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -273,9 +273,10 @@ PetscErrorCode DifferentialFilterSetup(Ceed ceed, User user, CeedData ceed_data,
   PetscCallCeed(ceed, CeedQFunctionContextCreate(ceed, &diff_filter_qfctx));
   PetscCallCeed(ceed, CeedQFunctionContextSetData(diff_filter_qfctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*diff_filter_ctx), diff_filter_ctx));
   PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(diff_filter_qfctx, CEED_MEM_HOST, FreeContextPetsc));
-  PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(
-                          diff_filter_qfctx, "filter width scaling", offsetof(struct DifferentialFilterContext_, width_scaling),
-                          sizeof(diff_filter_ctx->width_scaling) / sizeof(diff_filter_ctx->width_scaling[0]), "Filter width scaling"));
+  PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(diff_filter_qfctx, "filter width scaling",
+                                                         offsetof(struct DifferentialFilterContext_, width_scaling),
+                                                         sizeof(diff_filter_ctx->width_scaling) / sizeof(diff_filter_ctx->width_scaling[0]),
+                                                         "Filter width scaling"));
 
   // -- Setup Operators
   PetscCall(DifferentialFilterCreateOperators(ceed, user, ceed_data, diff_filter_qfctx));
diff --git a/examples/fluids/src/dm_utils.c b/examples/fluids/src/dm_utils.c
index 4e91ba6da2..55f99f058e 100644
--- a/examples/fluids/src/dm_utils.c
+++ b/examples/fluids/src/dm_utils.c
@@ -67,8 +67,8 @@ PetscErrorCode DMPlexCeedElemRestrictionCreate(Ceed ceed, DM dm, DMLabel domain_
   CeedInt *restriction_offsets_ceed = NULL;
 
   PetscFunctionBeginUser;
-  PetscCall(
-      DMPlexGetLocalOffsets(dm, domain_label, label_value, height, dm_field, &num_elem, &elem_size, &num_comp, &num_dof, &restriction_offsets_petsc));
+  PetscCall(DMPlexGetLocalOffsets(dm, domain_label, label_value, height, dm_field, &num_elem, &elem_size, &num_comp, &num_dof,
+                                  &restriction_offsets_petsc));
   PetscCall(IntArrayPetscToCeed(num_elem * elem_size, &restriction_offsets_petsc, &restriction_offsets_ceed));
   PetscCallCeed(ceed, CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES,
                                                 restriction_offsets_ceed, restriction));
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 3f912a1b11..5b74939ee0 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -475,9 +475,10 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS
       part_owned_dofs[1]             = gather_buffer[comm_size - 1];  // max
       part_owned_dofs[2]             = gather_buffer[median_index];   // median
       PetscReal part_owned_dof_ratio = (PetscReal)part_owned_dofs[1] / (PetscReal)part_owned_dofs[2];
-      PetscCall(PetscPrintf(
-          comm, "    Global Vector %" PetscInt_FMT "-DoF nodes          : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", num_comp_q,
-          part_owned_dofs[0] / num_comp_q, part_owned_dofs[1] / num_comp_q, part_owned_dofs[2] / num_comp_q, part_owned_dof_ratio));
+      PetscCall(PetscPrintf(comm,
+                            "    Global Vector %" PetscInt_FMT "-DoF nodes          : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
+                            num_comp_q, part_owned_dofs[0] / num_comp_q, part_owned_dofs[1] / num_comp_q, part_owned_dofs[2] / num_comp_q,
+                            part_owned_dof_ratio));
     }
 
     PetscCallMPI(MPI_Gather(&local_dofs, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm));
@@ -487,9 +488,10 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS
       part_local_dofs[1]             = gather_buffer[comm_size - 1];  // max
       part_local_dofs[2]             = gather_buffer[median_index];   // median
       PetscReal part_local_dof_ratio = (PetscReal)part_local_dofs[1] / (PetscReal)part_local_dofs[2];
-      PetscCall(PetscPrintf(
-          comm, "    Local Vector %" PetscInt_FMT "-DoF nodes           : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", num_comp_q,
-          part_local_dofs[0] / num_comp_q, part_local_dofs[1] / num_comp_q, part_local_dofs[2] / num_comp_q, part_local_dof_ratio));
+      PetscCall(PetscPrintf(comm,
+                            "    Local Vector %" PetscInt_FMT "-DoF nodes           : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
+                            num_comp_q, part_local_dofs[0] / num_comp_q, part_local_dofs[1] / num_comp_q, part_local_dofs[2] / num_comp_q,
+                            part_local_dof_ratio));
     }
 
     if (comm_size != 1) {
@@ -521,10 +523,11 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS
         part_boundary_dofs[1]           = gather_buffer[comm_size - 1];  // max
         part_boundary_dofs[2]           = gather_buffer[median_index];   // median
         PetscReal part_shared_dof_ratio = (PetscReal)part_boundary_dofs[1] / (PetscReal)part_boundary_dofs[2];
-        PetscCall(PetscPrintf(
-            comm, "    Ghost Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
-            num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
-            part_shared_dof_ratio));
+        PetscCall(PetscPrintf(comm,
+                              "    Ghost Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT
+                              ", %f\n",
+                              num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
+                              part_shared_dof_ratio));
       }
 
       PetscCallMPI(MPI_Gather(&num_ghost_interface_ranks, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm));
@@ -545,10 +548,11 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS
         part_boundary_dofs[1]           = gather_buffer[comm_size - 1];  // max
         part_boundary_dofs[2]           = gather_buffer[median_index];   // median
         PetscReal part_shared_dof_ratio = (PetscReal)part_boundary_dofs[1] / (PetscReal)part_boundary_dofs[2];
-        PetscCall(PetscPrintf(
-            comm, "    Owned Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
-            num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
-            part_shared_dof_ratio));
+        PetscCall(PetscPrintf(comm,
+                              "    Owned Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT
+                              ", %f\n",
+                              num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
+                              part_shared_dof_ratio));
       }
 
       PetscCallMPI(MPI_Gather(&num_owned_interface_ranks, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm));
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 1fdda9fb82..1a4e3f3e76 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -207,8 +207,8 @@ PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, PetscScalar time)
       PetscCall(VecZeroEntries(Q_refined_loc));
       PetscCall(DMGlobalToLocal(user->dm_viz, Q_refined, INSERT_VALUES, Q_refined_loc));
 
-      PetscCall(
-          PetscSNPrintf(file_path_refined, sizeof file_path_refined, "%s/nsrefined-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir, step_no));
+      PetscCall(PetscSNPrintf(file_path_refined, sizeof file_path_refined, "%s/nsrefined-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir,
+                              step_no));
 
       PetscCall(PetscViewerVTKOpen(PetscObjectComm((PetscObject)Q_refined), file_path_refined, FILE_MODE_WRITE, &viewer_refined));
       PetscCall(VecView(Q_refined_loc, viewer_refined));
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index bd856d4af4..f5f3c26854 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -206,8 +206,8 @@ PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data
 
   PetscCall(CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, (*stats_data)->basis_stats, (*stats_data)->elem_restr_parent_stats,
                                             &(*stats_data)->elem_restr_parent_colloc));
-  PetscCall(
-      CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, ceed_data->basis_q, ceed_data->elem_restr_q, &(*stats_data)->elem_restr_child_colloc));
+  PetscCall(CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, ceed_data->basis_q, ceed_data->elem_restr_q,
+                                            &(*stats_data)->elem_restr_child_colloc));
 
   {  // -- Copy DM coordinates into CeedVector
     DM cdm;
@@ -397,9 +397,9 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
 
     PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(collect_context, "solution time",
                                                            offsetof(struct Turbulence_SpanStatsContext_, solution_time), 1, "Current solution time"));
-    PetscCallCeed(
-        ceed, CeedQFunctionContextRegisterDouble(collect_context, "previous time", offsetof(struct Turbulence_SpanStatsContext_, previous_time), 1,
-                                                 "Previous time statistics collection was done"));
+    PetscCallCeed(ceed,
+                  CeedQFunctionContextRegisterDouble(collect_context, "previous time", offsetof(struct Turbulence_SpanStatsContext_, previous_time),
+                                                     1, "Previous time statistics collection was done"));
 
     PetscCallCeed(ceed, CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx));
   }
@@ -423,8 +423,8 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
   PetscCall(OperatorApplyContextCreate(user->dm, user->spanstats.dm, user->ceed, op_stats_collect, user->q_ceed, NULL, NULL, NULL,
                                        &user->spanstats.op_stats_collect_ctx));
 
-  PetscCall(
-      CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, NULL, &user->spanstats.Child_Stats_loc));
+  PetscCall(CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, NULL,
+                                        &user->spanstats.Child_Stats_loc));
   PetscCall(VecZeroEntries(user->spanstats.Child_Stats_loc));
 
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stats_collect));
diff --git a/examples/fluids/src/velocity_gradient_projection.c b/examples/fluids/src/velocity_gradient_projection.c
index 232c46946d..d022047900 100644
--- a/examples/fluids/src/velocity_gradient_projection.c
+++ b/examples/fluids/src/velocity_gradient_projection.c
@@ -22,8 +22,8 @@ PetscErrorCode VelocityGradientProjectionCreateDM(NodalProjectionData grad_velo_
   PetscCall(DMClone(user->dm, &grad_velo_proj->dm));
   PetscCall(PetscObjectSetName((PetscObject)grad_velo_proj->dm, "Velocity Gradient Projection"));
 
-  PetscCall(
-      DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, user->app_ctx->degree, 1, user->app_ctx->q_extra, 1, &grad_velo_proj->num_comp, grad_velo_proj->dm));
+  PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, user->app_ctx->degree, 1, user->app_ctx->q_extra, 1, &grad_velo_proj->num_comp,
+                               grad_velo_proj->dm));
 
   PetscCall(DMGetLocalSection(grad_velo_proj->dm, &section));
   PetscCall(PetscSectionSetFieldName(section, 0, ""));
@@ -67,8 +67,8 @@ PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ce
   // -- Build RHS operator
   switch (state_var_input) {
     case STATEVAR_PRIMITIVE:
-      PetscCallCeed(
-          ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Prim, VelocityGradientProjectionRHS_Prim_loc, &qf_rhs_assemble));
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Prim, VelocityGradientProjectionRHS_Prim_loc,
+                                                      &qf_rhs_assemble));
       break;
     case STATEVAR_CONSERVATIVE:
       PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Conserv, VelocityGradientProjectionRHS_Conserv_loc,
diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index ed8ae0aca8..4970494304 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -385,8 +385,8 @@ int main(int argc, char **argv) {
   PetscInt two       = 2;
   ksp_max_it_clip[0] = 5;
   ksp_max_it_clip[1] = 20;
-  PetscCall(
-      PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, ksp_max_it_clip, &two, NULL));
+  PetscCall(PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, ksp_max_it_clip, &two,
+                                 NULL));
   PetscOptionsEnd();
   P = degree + 1;
   Q = P + q_extra;
@@ -767,8 +767,8 @@ int main(int argc, char **argv) {
       }
     }
     if (!test_mode) {
-      PetscCall(
-          PetscPrintf(comm, "    DoFs/Sec in CG                     : %g (%g) million\n", 1e-6 * gsize * its / rt_max, 1e-6 * gsize * its / rt_min));
+      PetscCall(PetscPrintf(comm, "    DoFs/Sec in CG                     : %g (%g) million\n", 1e-6 * gsize * its / rt_max,
+                            1e-6 * gsize * its / rt_min));
     }
   }
 
diff --git a/examples/petsc/dmswarm.c b/examples/petsc/dmswarm.c
index cc618413bf..903f601cc4 100644
--- a/examples/petsc/dmswarm.c
+++ b/examples/petsc/dmswarm.c
@@ -82,10 +82,10 @@ int main(int argc, char **argv) {
   PetscOptionsBegin(comm, NULL, "libCEED example using PETSc with DMSwarm", NULL);
 
   PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, test_mode, &test_mode, NULL));
-  PetscCall(
-      PetscOptionsBool("-u_petsc_swarm_view", "View XDMF of swarm values interpolated by PETSc", NULL, view_petsc_swarm, &view_petsc_swarm, NULL));
-  PetscCall(
-      PetscOptionsBool("-u_ceed_swarm_view", "View XDMF of swarm values interpolated by libCEED", NULL, view_ceed_swarm, &view_ceed_swarm, NULL));
+  PetscCall(PetscOptionsBool("-u_petsc_swarm_view", "View XDMF of swarm values interpolated by PETSc", NULL, view_petsc_swarm, &view_petsc_swarm,
+                             NULL));
+  PetscCall(PetscOptionsBool("-u_ceed_swarm_view", "View XDMF of swarm values interpolated by libCEED", NULL, view_ceed_swarm, &view_ceed_swarm,
+                             NULL));
   PetscCall(PetscOptionsEnum("-target", "Target field function", NULL, target_types, (PetscEnum)target_type, (PetscEnum *)&target_type, NULL));
   PetscCall(PetscOptionsInt("-solution_order", "Order of mesh solution space", NULL, solution_order, &solution_order, NULL));
   PetscCall(PetscOptionsInt("-mesh_order", "Order of mesh coordinate space", NULL, mesh_order, &mesh_order, NULL));
diff --git a/examples/solids/src/cl-options.c b/examples/solids/src/cl-options.c
index b2e203dda7..4e6087990b 100644
--- a/examples/solids/src/cl-options.c
+++ b/examples/solids/src/cl-options.c
@@ -65,8 +65,8 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) {
 
   // Dirichlet boundary conditions
   app_ctx->bc_clamp_count = 16;
-  PetscCall(
-      PetscOptionsIntArray("-bc_clamp", "Face IDs to apply incremental Dirichlet BC", NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count, NULL));
+  PetscCall(PetscOptionsIntArray("-bc_clamp", "Face IDs to apply incremental Dirichlet BC", NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count,
+                                 NULL));
   // Set vector for each clamped BC
   for (PetscInt i = 0; i < app_ctx->bc_clamp_count; i++) {
     // Translation vector
diff --git a/examples/solids/src/setup-libceed.c b/examples/solids/src/setup-libceed.c
index 16f3b076af..16adff9f84 100644
--- a/examples/solids/src/setup-libceed.c
+++ b/examples/solids/src/setup-libceed.c
@@ -316,8 +316,8 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, Ceed
       CeedOperator        op_traction;
       CeedQFunctionContextSetData(traction_ctx, CEED_MEM_HOST, CEED_USE_POINTER, 3 * sizeof(CeedScalar), app_ctx->bc_traction_vector[i]);
       // Setup restriction
-      PetscCall(
-          GetRestrictionForDomain(ceed, dm, 1, domain_label, app_ctx->bc_traction_faces[i], Q, 0, &elem_restr_u_face, &elem_restr_x_face, NULL));
+      PetscCall(GetRestrictionForDomain(ceed, dm, 1, domain_label, app_ctx->bc_traction_faces[i], Q, 0, &elem_restr_u_face, &elem_restr_x_face,
+                                        NULL));
       // ---- Create boundary Operator
       CeedOperatorCreate(ceed, qf_traction, NULL, NULL, &op_traction);
       CeedOperatorSetField(op_traction, "dx", elem_restr_x_face, basis_x_face, CEED_VECTOR_ACTIVE);
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 3a64789a94..5da52c8b25 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -689,8 +689,8 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateOriented");
-    CeedCall(
-        CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients, rstr));
+    CeedCall(CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients,
+                                               rstr));
     CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
@@ -1031,8 +1031,8 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn
   (*rstr)->num_block   = num_block;
   (*rstr)->block_size  = block_size;
   (*rstr)->rstr_type   = CEED_RESTRICTION_ORIENTED;
-  CeedCall(
-      ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, (const CeedInt *)block_offsets, (const bool *)block_orients, NULL, *rstr));
+  CeedCall(ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, (const CeedInt *)block_offsets, (const bool *)block_orients, NULL,
+                                              *rstr));
   if (copy_mode == CEED_OWN_POINTER) CeedCall(CeedFree(&offsets));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 84d3f1a1e0..94237510db 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -2696,8 +2696,8 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin
   }
 
   // Core code
-  CeedCall(
-      CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong,
+                                                       op_restrict));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2762,8 +2762,8 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
   }
 
   // Core code
-  CeedCall(
-      CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong,
+                                                       op_restrict));
   CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
@@ -2827,8 +2827,8 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f
   }
 
   // Core code
-  CeedCall(
-      CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong,
+                                                       op_restrict));
   CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }

From 5a526491291e2ef13670ec99232a2cb0069702e5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 30 Sep 2025 11:23:30 -0600
Subject: [PATCH 502/571] op - add SetNumViewTabs

---
 include/ceed-impl.h              |  1 +
 include/ceed/backend.h           |  1 +
 include/ceed/ceed.h              |  1 +
 interface/ceed-fortran.c         |  7 +++
 interface/ceed-operator.c        | 91 +++++++++++++++++++++++---------
 tests/output/t504-operator-f.out | 40 +++++++-------
 tests/output/t504-operator.out   | 42 +++++++--------
 tests/output/t523-operator-f.out | 82 ++++++++++++++--------------
 tests/output/t523-operator.out   | 88 +++++++++++++++---------------
 tests/t504-operator-f.f90        |  1 +
 tests/t504-operator.c            |  1 +
 tests/t523-operator-f.f90        |  1 +
 tests/t523-operator.c            |  1 +
 13 files changed, 206 insertions(+), 151 deletions(-)

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 0fa7de80d2..311d0c1bba 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -368,6 +368,7 @@ struct CeedOperator_private {
   CeedOperatorField        *input_fields;
   CeedOperatorField        *output_fields;
   CeedSize                  input_size, output_size;
+  CeedInt                   num_tabs;   /* Viewing offset */
   CeedInt                   num_elem;   /* Number of elements */
   CeedInt                   num_qpts;   /* Number of quadrature points over all elements */
   CeedInt                   num_fields; /* Number of fields that have been set */
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 6bf1ba3bb6..ec4534b298 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -446,6 +446,7 @@ CEED_EXTERN int CeedOperatorAssemblyDataGetElemRestrictions(CeedOperatorAssembly
                                                             CeedElemRestriction **active_elem_rstrs_out);
 CEED_EXTERN int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data);
 
+CEED_EXTERN int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs);
 CEED_EXTERN int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis);
 CEED_EXTERN int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis);
 CEED_EXTERN int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *active_rstr);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index f0207d6abf..a4ed4a9006 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -453,6 +453,7 @@ CEED_EXTERN int  CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVe
 CEED_EXTERN int  CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorSetName(CeedOperator op, const char *name);
 CEED_EXTERN int  CeedOperatorGetName(CeedOperator op, const char **name);
+CEED_EXTERN int  CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs);
 CEED_EXTERN int  CeedOperatorView(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorViewTerse(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorGetCeed(CeedOperator op, Ceed *ceed);
diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c
index e62cb30360..5da94e1b02 100644
--- a/interface/ceed-fortran.c
+++ b/interface/ceed-fortran.c
@@ -1019,6 +1019,13 @@ CEED_EXTERN void fCeedOperatorSetName(int *op, const char *name, int *err, fortr
   *err = CeedOperatorSetName(op_, name_c);
 }
 
+#define fCeedOperatorSetNumViewTabs FORTRAN_NAME(ceedoperatorsetnumviewtabs, CEEDOPERATORSETNUMVIEWTABS)
+CEED_EXTERN void fCeedOperatorSetNumViewTabs(int *op, int *ntabs, int *err) {
+  CeedOperator op_ = CeedOperator_dict[*op];
+
+  *err = CeedOperatorSetNumViewTabs(op_, *ntabs);
+}
+
 #define fCeedOperatorLinearAssembleQFunction FORTRAN_NAME(ceedoperatorlinearassembleqfunction, CEEDOPERATORLINEARASSEMBLEQFUNCTION)
 CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, int *assembledvec, int *assembledrstr, int *rqst, int *err) {
   // Vector
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 3a0a0ddfe7..e2943cd3cf 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -87,7 +87,7 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl
   @param[in] op_field     `CeedOperator` Field to view
   @param[in] qf_field     `CeedQFunction` Field (carries field name)
   @param[in] field_number Number of field being viewed
-  @param[in] sub          true indicates sub-operator, which increases indentation; false for top-level operator
+  @param[in] tabs         Tabs to append before each line
   @param[in] input        true for an input field; false for output field
   @param[in] stream       Stream to view to, e.g., `stdout`
 
@@ -95,8 +95,8 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl
 
   @ref Utility
 **/
-static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, bool sub, bool input, FILE *stream) {
-  const char  *pre    = sub ? "  " : "";
+static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, const char *tabs, bool input,
+                                 FILE *stream) {
   const char  *in_out = input ? "Input" : "Output";
   const char  *field_name;
   CeedInt      size;
@@ -112,12 +112,12 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField
           "%s    %s field %" CeedInt_FMT
           ":\n"
           "%s      Name: \"%s\"\n",
-          pre, in_out, field_number, pre, field_name);
-  fprintf(stream, "%s      Size: %" CeedInt_FMT "\n", pre, size);
-  fprintf(stream, "%s      EvalMode: %s\n", pre, CeedEvalModes[eval_mode]);
-  if (basis == CEED_BASIS_NONE) fprintf(stream, "%s      No basis\n", pre);
-  if (vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s      Active vector\n", pre);
-  else if (vec == CEED_VECTOR_NONE) fprintf(stream, "%s      No vector\n", pre);
+          tabs, in_out, field_number, tabs, field_name);
+  fprintf(stream, "%s      Size: %" CeedInt_FMT "\n", tabs, size);
+  fprintf(stream, "%s      EvalMode: %s\n", tabs, CeedEvalModes[eval_mode]);
+  if (basis == CEED_BASIS_NONE) fprintf(stream, "%s      No basis\n", tabs);
+  if (vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s      Active vector\n", tabs);
+  else if (vec == CEED_VECTOR_NONE) fprintf(stream, "%s      No vector\n", tabs);
 
   CeedCall(CeedVectorDestroy(&vec));
   CeedCall(CeedBasisDestroy(&basis));
@@ -128,16 +128,15 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField
   @brief View a single `CeedOperator`
 
   @param[in] op     `CeedOperator` to view
-  @param[in] sub    Boolean flag for sub-operator
+  @param[in] tabs   Tabs to append before each new line
   @param[in] stream Stream to write; typically `stdout` or a file
 
   @return Error code: 0 - success, otherwise - failure
 
   @ref Utility
 **/
-int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) {
+int CeedOperatorSingleView(CeedOperator op, const char *tabs, FILE *stream) {
   bool                is_at_points;
-  const char         *pre = sub ? "  " : "";
   CeedInt             num_elem, num_qpts, total_fields = 0, num_input_fields, num_output_fields;
   CeedQFunction       qf;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -158,23 +157,38 @@ int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) {
 
     CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
     CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_points));
-    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " max points each\n", pre, num_elem, max_points);
+    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " max points each\n", tabs, num_elem, max_points);
     CeedCall(CeedElemRestrictionDestroy(&rstr_points));
   } else {
-    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", pre, num_elem, num_qpts);
+    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", tabs, num_elem, num_qpts);
   }
-  fprintf(stream, "%s  %" CeedInt_FMT " field%s\n", pre, total_fields, total_fields > 1 ? "s" : "");
-  fprintf(stream, "%s  %" CeedInt_FMT " input field%s:\n", pre, num_input_fields, num_input_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " field%s\n", tabs, total_fields, total_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " input field%s:\n", tabs, num_input_fields, num_input_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCall(CeedOperatorFieldView(op_input_fields[i], qf_input_fields[i], i, sub, 1, stream));
+    CeedCall(CeedOperatorFieldView(op_input_fields[i], qf_input_fields[i], i, tabs, 1, stream));
   }
-  fprintf(stream, "%s  %" CeedInt_FMT " output field%s:\n", pre, num_output_fields, num_output_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " output field%s:\n", tabs, num_output_fields, num_output_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCall(CeedOperatorFieldView(op_output_fields[i], qf_output_fields[i], i, sub, 0, stream));
+    CeedCall(CeedOperatorFieldView(op_output_fields[i], qf_output_fields[i], i, tabs, 0, stream));
   }
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedOperatorView() output
+
+  @param[in]  op       `CeedOperator` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs) {
+  *num_tabs = op->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`.
 
@@ -1588,31 +1602,58 @@ int CeedOperatorGetName(CeedOperator op, const char **name) {
   @ref Developer
 **/
 static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
-  bool        has_name, is_composite, is_at_points;
-  const char *name = NULL;
+  bool          has_name, is_composite, is_at_points;
+  char         *tabs      = NULL;
+  const char   *name      = NULL;
+  const CeedInt tab_width = 2;
+  CeedInt       num_tabs  = 0;
 
   CeedCall(CeedOperatorGetName(op, &name));
   has_name = name ? strlen(name) : false;
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  // Set tabs
+  CeedCall(CeedOperatorGetNumViewTabs(op, &num_tabs));
+  CeedCall(CeedCalloc(tab_width * (num_tabs + is_composite) + 1, &tabs));
+  for (CeedInt i = 0; i < tab_width * num_tabs; i++) tabs[i] = ' ';
   if (is_composite) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
     CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
     CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+    fprintf(stream, tabs);
     fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? name : "");
-
+    for (CeedInt i = 0; i < tab_width; i++) tabs[tab_width * num_tabs + i] = ' ';
     for (CeedInt i = 0; i < num_suboperators; i++) {
       has_name = sub_operators[i]->name;
-      fprintf(stream, "  SubOperator%s %" CeedInt_FMT "%s%s%s\n", is_at_points ? " AtPoints" : "", i, has_name ? " - " : "",
+      fprintf(stream, tabs);
+      fprintf(stream, "SubOperator%s %" CeedInt_FMT "%s%s%s\n", is_at_points ? " AtPoints" : "", i, has_name ? " - " : "",
               has_name ? sub_operators[i]->name : "", is_full ? ":" : "");
-      if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], 1, stream));
+      if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], tabs, stream));
     }
   } else {
+    fprintf(stream, tabs);
     fprintf(stream, "CeedOperator%s%s%s\n", is_at_points ? " AtPoints" : "", has_name ? " - " : "", has_name ? name : "");
-    if (is_full) CeedCall(CeedOperatorSingleView(op, 0, stream));
+    if (is_full) CeedCall(CeedOperatorSingleView(op, tabs, stream));
   }
+  CeedCall(CeedFree(&tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set the number of tabs to indent for @ref CeedOperatorView() output
+
+  @param[in] op       `CeedOperator` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  op->num_tabs = num_tabs;
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/tests/output/t504-operator-f.out b/tests/output/t504-operator-f.out
index 41b0ea772b..3b5857619f 100644
--- a/tests/output/t504-operator-f.out
+++ b/tests/output/t504-operator-f.out
@@ -19,23 +19,23 @@ CeedOperator - setup
       EvalMode: none
       No basis
       Active vector
-CeedOperator - mass
-  15 elements with 8 quadrature points each
-  3 fields
-  2 input fields:
-    Input field 0:
-      Name: "rho"
-      Size: 1
-      EvalMode: none
-      No basis
-    Input field 1:
-      Name: "u"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
-  1 output field:
-    Output field 0:
-      Name: "v"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
+  CeedOperator - mass
+    15 elements with 8 quadrature points each
+    3 fields
+    2 input fields:
+      Input field 0:
+        Name: "rho"
+        Size: 1
+        EvalMode: none
+        No basis
+      Input field 1:
+        Name: "u"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
diff --git a/tests/output/t504-operator.out b/tests/output/t504-operator.out
index 3f1a31a7d4..4f23570743 100644
--- a/tests/output/t504-operator.out
+++ b/tests/output/t504-operator.out
@@ -20,24 +20,24 @@ CeedOperator - setup
       EvalMode: none
       No basis
       Active vector
-CeedOperator - mass
-CeedOperator - mass
-  15 elements with 8 quadrature points each
-  3 fields
-  2 input fields:
-    Input field 0:
-      Name: "rho"
-      Size: 1
-      EvalMode: none
-      No basis
-    Input field 1:
-      Name: "u"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
-  1 output field:
-    Output field 0:
-      Name: "v"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
+  CeedOperator - mass
+  CeedOperator - mass
+    15 elements with 8 quadrature points each
+    3 fields
+    2 input fields:
+      Input field 0:
+        Name: "rho"
+        Size: 1
+        EvalMode: none
+        No basis
+      Input field 1:
+        Name: "u"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
diff --git a/tests/output/t523-operator-f.out b/tests/output/t523-operator-f.out
index 1817a8a2cf..2a17d484bb 100644
--- a/tests/output/t523-operator-f.out
+++ b/tests/output/t523-operator-f.out
@@ -39,44 +39,44 @@ Composite CeedOperator - setup
         Size: 1
         EvalMode: none
         No basis
-Composite CeedOperator - mass
-  SubOperator 0 - triangle elements:
-    6 elements with 4 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-  SubOperator 1 - quadrilateral elements:
-    6 elements with 16 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
+  Composite CeedOperator - mass
+    SubOperator 0 - triangle elements:
+      6 elements with 4 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+    SubOperator 1 - quadrilateral elements:
+      6 elements with 16 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
diff --git a/tests/output/t523-operator.out b/tests/output/t523-operator.out
index 87d61f5b0a..742f6954a7 100644
--- a/tests/output/t523-operator.out
+++ b/tests/output/t523-operator.out
@@ -42,47 +42,47 @@ Composite CeedOperator - setup
         Size: 1
         EvalMode: none
         No basis
-Composite CeedOperator - mass
-  SubOperator 0 - triangle elements
-  SubOperator 1 - quadrilateral elements
-Composite CeedOperator - mass
-  SubOperator 0 - triangle elements:
-    6 elements with 4 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-  SubOperator 1 - quadrilateral elements:
-    6 elements with 16 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
+  Composite CeedOperator - mass
+    SubOperator 0 - triangle elements
+    SubOperator 1 - quadrilateral elements
+  Composite CeedOperator - mass
+    SubOperator 0 - triangle elements:
+      6 elements with 4 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+    SubOperator 1 - quadrilateral elements:
+      6 elements with 16 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
diff --git a/tests/t504-operator-f.f90 b/tests/t504-operator-f.f90
index 5e555d5c13..beedfda264 100644
--- a/tests/t504-operator-f.f90
+++ b/tests/t504-operator-f.f90
@@ -95,6 +95,7 @@ program test
       call ceedoperatorsetname(op_setup,'setup',err)
       call ceedoperatorview(op_setup,err)
       call ceedoperatorsetname(op_mass,'mass',err)
+      call ceedoperatorsetnumviewtabs(op_mass,1,err)
       call ceedoperatorview(op_mass,err)
 
       call ceedvectordestroy(qdata,err)
diff --git a/tests/t504-operator.c b/tests/t504-operator.c
index 4d75c79b0c..d7046ab08d 100644
--- a/tests/t504-operator.c
+++ b/tests/t504-operator.c
@@ -70,6 +70,7 @@ int main(int argc, char **argv) {
   CeedOperatorViewTerse(op_setup, stdout);
   CeedOperatorView(op_setup, stdout);
   CeedOperatorSetName(op_mass, "mass");
+  CeedOperatorSetNumViewTabs(op_mass, 1);
   CeedOperatorViewTerse(op_mass, stdout);
   CeedOperatorView(op_mass, stdout);
 
diff --git a/tests/t523-operator-f.f90 b/tests/t523-operator-f.f90
index ea20128d3c..fcd4504fe5 100644
--- a/tests/t523-operator-f.f90
+++ b/tests/t523-operator-f.f90
@@ -212,6 +212,7 @@ program test
 
       call ceedoperatorcreatecomposite(ceed,op_mass,err)
       call ceedoperatorsetname(op_mass,'mass',err)
+      call ceedoperatorsetnumviewtabs(op_mass,1,err)
       call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
       call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
diff --git a/tests/t523-operator.c b/tests/t523-operator.c
index 2e9dd5c264..a1e2307839 100644
--- a/tests/t523-operator.c
+++ b/tests/t523-operator.c
@@ -166,6 +166,7 @@ int main(int argc, char **argv) {
   // View
   CeedOperatorViewTerse(op_setup, stdout);
   CeedOperatorView(op_setup, stdout);
+  CeedOperatorSetNumViewTabs(op_mass, 1);
   CeedOperatorViewTerse(op_mass, stdout);
   CeedOperatorView(op_mass, stdout);
 

From b7fd8817a7bcbefc12e1275a5b7700c26a9adcf5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 1 Oct 2025 10:04:33 -0600
Subject: [PATCH 503/571] op - minor naming consistency

---
 interface/ceed-operator.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index e2943cd3cf..907b4e8a8b 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -88,16 +88,16 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl
   @param[in] qf_field     `CeedQFunction` Field (carries field name)
   @param[in] field_number Number of field being viewed
   @param[in] tabs         Tabs to append before each line
-  @param[in] input        true for an input field; false for output field
+  @param[in] is_input    `true` for an input field; `false` for output field
   @param[in] stream       Stream to view to, e.g., `stdout`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Utility
 **/
-static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, const char *tabs, bool input,
+static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, const char *tabs, bool is_input,
                                  FILE *stream) {
-  const char  *in_out = input ? "Input" : "Output";
+  const char  *field_type = is_input ? "Input" : "Output";
   const char  *field_name;
   CeedInt      size;
   CeedEvalMode eval_mode;
@@ -112,7 +112,7 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField
           "%s    %s field %" CeedInt_FMT
           ":\n"
           "%s      Name: \"%s\"\n",
-          tabs, in_out, field_number, tabs, field_name);
+          tabs, field_type, field_number, tabs, field_name);
   fprintf(stream, "%s      Size: %" CeedInt_FMT "\n", tabs, size);
   fprintf(stream, "%s      EvalMode: %s\n", tabs, CeedEvalModes[eval_mode]);
   if (basis == CEED_BASIS_NONE) fprintf(stream, "%s      No basis\n", tabs);

From df1daa628943f4e245c41a90ef855a29606b49ff Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 2 Oct 2025 18:26:53 -0600
Subject: [PATCH 504/571] minor - Don't fprintf string pointers

---
 interface/ceed-operator.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 907b4e8a8b..8da35425fd 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1622,18 +1622,18 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
 
     CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
     CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
-    fprintf(stream, tabs);
+    fprintf(stream, "%s", tabs);
     fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? name : "");
     for (CeedInt i = 0; i < tab_width; i++) tabs[tab_width * num_tabs + i] = ' ';
     for (CeedInt i = 0; i < num_suboperators; i++) {
       has_name = sub_operators[i]->name;
-      fprintf(stream, tabs);
+      fprintf(stream, "%s", tabs);
       fprintf(stream, "SubOperator%s %" CeedInt_FMT "%s%s%s\n", is_at_points ? " AtPoints" : "", i, has_name ? " - " : "",
               has_name ? sub_operators[i]->name : "", is_full ? ":" : "");
       if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], tabs, stream));
     }
   } else {
-    fprintf(stream, tabs);
+    fprintf(stream, "%s", tabs);
     fprintf(stream, "CeedOperator%s%s%s\n", is_at_points ? " AtPoints" : "", has_name ? " - " : "", has_name ? name : "");
     if (is_full) CeedCall(CeedOperatorSingleView(op, tabs, stream));
   }

From f32835d2003e815a62ae274a1abec7ba5861b972 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 9 Oct 2025 11:00:50 -0600
Subject: [PATCH 505/571] fix(ref atpoints) - Fix ref assembly at-points to
 properly handle empty elements

---
 backends/ref/ceed-ref-operator.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 6b306f7cba..5bad14bb6f 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1208,6 +1208,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
     // Setup points for element
     CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
 
     // Input basis apply
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, NULL,
@@ -1391,6 +1392,7 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, Ce
     // Setup points for element
     CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
 
     // Input basis apply for non-active bases
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
@@ -1597,6 +1599,7 @@ static int CeedOperatorAssembleSingleAtPoints_Ref(CeedOperator op, CeedInt offse
     CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem,
                                                               CEED_REQUEST_IMMEDIATE));
     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
 
     // Input basis apply for non-active bases
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,

From dbab11867bb0a967ed0a40975979a130b10b080b Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Tue, 14 Oct 2025 17:37:04 -0600
Subject: [PATCH 506/571] atpoints - fix insidious memory bug

---
 backends/cuda-ref/ceed-cuda-ref-restriction.c | 2 +-
 backends/hip-ref/ceed-hip-ref-restriction.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index 0eb2924975..ac3b061996 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -550,7 +550,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
     CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points = offsets[i + 1] - offsets[i];
-      CeedInt last_point = offsets[offsets[i]] * num_comp;
+      CeedInt last_point = 0;
 
       points_per_elem[i] = num_points;
       at_points_size += num_points;
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 1b21b0e5cb..56080e8676 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -551,7 +551,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
     CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points = offsets[i + 1] - offsets[i];
-      CeedInt last_point = offsets[offsets[i]] * num_comp;
+      CeedInt last_point = 0;
 
       points_per_elem[i] = num_points;
       at_points_size += num_points;

From 4c789ea28eed8450eb29ac731b0b6c80a3f128bb Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 7 Nov 2025 13:57:39 -0700
Subject: [PATCH 507/571] view - add Ceed*[Get/Set]ViewTabs for all objects

---
 include/ceed-impl.h               |  6 +++
 include/ceed/backend.h            | 10 ++++
 include/ceed/ceed.h               |  6 +++
 interface/ceed-basis.c            | 78 ++++++++++++++++++++++++-------
 interface/ceed-elemrestriction.c  | 49 +++++++++++++++++--
 interface/ceed-operator.c         | 17 ++++---
 interface/ceed-qfunction.c        | 64 ++++++++++++++++++++-----
 interface/ceed-qfunctioncontext.c | 48 +++++++++++++++++--
 interface/ceed-vector.c           | 49 +++++++++++++++++--
 interface/ceed.c                  | 49 +++++++++++++++++--
 10 files changed, 323 insertions(+), 53 deletions(-)

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 311d0c1bba..6fd4ed523a 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -104,6 +104,7 @@ struct Ceed_private {
   bool         cuda_compile_with_clang;
   char       **jit_defines;
   CeedInt      num_jit_defines, max_jit_defines, num_jit_defines_readers;
+  CeedInt      num_tabs; /* Viewing offset */
   int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *);
   int (*SetStream)(Ceed, void *);
   int (*GetPreferredMemType)(CeedMemType *);
@@ -160,6 +161,7 @@ struct CeedVector_private {
   CeedSize length;
   uint64_t state;
   uint64_t num_readers;
+  CeedInt  num_tabs; /* Viewing offset */
   void    *data;
 };
 
@@ -192,6 +194,7 @@ struct CeedElemRestriction_private {
   CeedRestrictionType
            rstr_type;   /* initialized in element restriction constructor for default, oriented, curl-oriented, or strided element restriction */
   uint64_t num_readers; /* number of instances of offset read only access */
+  CeedInt  num_tabs;    /* Viewing offset */
   void    *data;        /* place for the backend to store any data */
 };
 
@@ -225,6 +228,7 @@ struct CeedBasis_private {
                        quadrature points for H(curl) discretizations */
   CeedVector  vec_chebyshev;
   CeedBasis   basis_chebyshev; /* basis interpolating from nodes to Chebyshev polynomial coefficients */
+  CeedInt     num_tabs;        /* Viewing offset */
   void       *data;            /* place for the backend to store any data */
 };
 
@@ -251,6 +255,7 @@ struct CeedQFunction_private {
   int (*Destroy)(CeedQFunction);
   int                  ref_count;
   CeedInt              vec_length; /* Number of quadrature points must be padded to a multiple of vec_length */
+  CeedInt              num_tabs;   /* Viewing offset */
   CeedQFunctionField  *input_fields;
   CeedQFunctionField  *output_fields;
   CeedInt              num_input_fields, num_output_fields;
@@ -286,6 +291,7 @@ struct CeedQFunctionContext_private {
   CeedMemType                         data_destroy_mem_type;
   CeedInt                             num_fields;
   CeedInt                             max_fields;
+  CeedInt                             num_tabs; /* Viewing offset */
   CeedContextFieldLabel              *field_labels;
   uint64_t                            state;
   uint64_t                            num_readers;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index ec4534b298..a5785b4937 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -74,6 +74,10 @@
 #define CeedPragmaCritical(x) CeedPragmaOMP(critical(x))
 #endif
 
+/// This macro provides the tab width for viewing Ceed objects.
+/// @ingroup Ceed
+#define CEED_TAB_WIDTH 2
+
 /**
   This enum supplies common colors for CeedDebug256 debugging output.
   Set the environment variable `CEED_DEBUG = 1` to activate debugging output.
@@ -248,6 +252,7 @@ CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj
 CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name);
 CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed);
 CEED_EXTERN int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed);
+CEED_EXTERN int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs);
 CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic);
 CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void));
 CEED_EXTERN int CeedGetData(Ceed ceed, void *data);
@@ -264,6 +269,7 @@ CEED_EXTERN int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_
 CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines);
 CEED_EXTERN int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines);
 
+CEED_EXTERN int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs);
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
 CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
@@ -292,6 +298,7 @@ typedef enum {
 } CeedRestrictionType;
 
 CEED_EXTERN int CeedElemRestrictionGetType(CeedElemRestriction rstr, CeedRestrictionType *rstr_type);
+CEED_EXTERN int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs);
 CEED_EXTERN int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided);
 CEED_EXTERN int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points);
 CEED_EXTERN int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible);
@@ -332,6 +339,7 @@ CEED_EXTERN const char *const CeedFESpaces[];
 
 CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *colo_grad_1d);
 CEED_EXTERN int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d);
+CEED_EXTERN int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs);
 CEED_EXTERN int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor);
 CEED_EXTERN int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated);
 CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data);
@@ -372,6 +380,7 @@ CEED_EXTERN int CeedQFunctionGetName(CeedQFunction qf, const char **name);
 CEED_EXTERN int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path);
 CEED_EXTERN int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer);
 CEED_EXTERN int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f);
+CEED_EXTERN int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs);
 CEED_EXTERN int CeedQFunctionGetContext(CeedQFunction qf, CeedQFunctionContext *ctx);
 CEED_EXTERN int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, void *data);
 CEED_EXTERN int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data);
@@ -389,6 +398,7 @@ CEED_EXTERN int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops)
 
 CEED_EXTERN int  CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed);
 CEED_EXTERN Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx);
+CEED_EXTERN int  CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs);
 CEED_EXTERN int  CeedQFunctionContextHasValidData(CeedQFunctionContext ctx, bool *has_valid_data);
 CEED_EXTERN int  CeedQFunctionContextHasBorrowedDataOfType(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type);
 CEED_EXTERN int  CeedQFunctionContextGetState(CeedQFunctionContext ctx, uint64_t *state);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index a4ed4a9006..69c8afe281 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -109,6 +109,7 @@ CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
 CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
 CEED_EXTERN int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root);
 CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
+CEED_EXTERN int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs);
 CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
 CEED_EXTERN int CeedDestroy(Ceed *ceed);
 CEED_EXTERN int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, int ecode, const char *format, ...);
@@ -204,6 +205,7 @@ CEED_EXTERN int  CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x);
 CEED_EXTERN int  CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x);
 CEED_EXTERN int  CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y);
 CEED_EXTERN int  CeedVectorReciprocal(CeedVector vec);
+CEED_EXTERN int  CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs);
 CEED_EXTERN int  CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt step, const char *fp_fmt, FILE *stream);
 CEED_EXTERN int  CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream);
 CEED_EXTERN int  CeedVectorGetCeed(CeedVector vec, Ceed *ceed);
@@ -293,6 +295,7 @@ CEED_EXTERN int  CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, C
 CEED_EXTERN int  CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, CeedInt *num_block);
 CEED_EXTERN int  CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, CeedInt *block_size);
 CEED_EXTERN int  CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult);
+CEED_EXTERN int  CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs);
 CEED_EXTERN int  CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream);
 CEED_EXTERN int  CeedElemRestrictionDestroy(CeedElemRestriction *rstr);
 
@@ -312,6 +315,7 @@ CEED_EXTERN int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt n
                                      const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
 CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project);
 CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy);
+CEED_EXTERN int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs);
 CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream);
 CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
 CEED_EXTERN int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
@@ -368,6 +372,7 @@ CEED_EXTERN int  CeedQFunctionGetFields(CeedQFunction qf, CeedInt *num_input_fie
 CEED_EXTERN int  CeedQFunctionSetContext(CeedQFunction qf, CeedQFunctionContext ctx);
 CEED_EXTERN int  CeedQFunctionSetContextWritable(CeedQFunction qf, bool is_writable);
 CEED_EXTERN int  CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops);
+CEED_EXTERN int  CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs);
 CEED_EXTERN int  CeedQFunctionView(CeedQFunction qf, FILE *stream);
 CEED_EXTERN int  CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed);
 CEED_EXTERN Ceed CeedQFunctionReturnCeed(CeedQFunction qf);
@@ -406,6 +411,7 @@ CEED_EXTERN int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx,
 CEED_EXTERN int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, const char **field_name, size_t *field_offset, size_t *num_values,
                                                     const char **field_description, CeedContextFieldType *field_type);
 CEED_EXTERN int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_size);
+CEED_EXTERN int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs);
 CEED_EXTERN int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream);
 CEED_EXTERN int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f);
 CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 6dfbd55d61..ce1ccae1dc 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -153,23 +153,24 @@ static int CeedGivensRotation(CeedScalar *A, CeedScalar c, CeedScalar s, CeedTra
   @param[in] m      Number of rows in array
   @param[in] n      Number of columns in array
   @param[in] a      Array to be viewed
+  @param[in] tabs   Tabs to append before each new line
   @param[in] stream Stream to view to, e.g., `stdout`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Developer
 **/
-static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedInt n, const CeedScalar *a, FILE *stream) {
+static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedInt n, const CeedScalar *a, const char *tabs, FILE *stream) {
   if (m > 1) {
-    fprintf(stream, "  %s:\n", name);
+    fprintf(stream, "%s  %s:\n", tabs, name);
   } else {
     char padded_name[12];
 
     snprintf(padded_name, 11, "%s:", name);
-    fprintf(stream, "  %-10s", padded_name);
+    fprintf(stream, "%s  %-10s", tabs, padded_name);
   }
   for (CeedInt i = 0; i < m; i++) {
-    if (m > 1) fprintf(stream, "    [%" CeedInt_FMT "]", i);
+    if (m > 1) fprintf(stream, "%s    [%" CeedInt_FMT "]", tabs, i);
     for (CeedInt j = 0; j < n; j++) fprintf(stream, fp_fmt, fabs(a[i * n + j]) > 1E-14 ? a[i * n + j] : 0);
     fputs("\n", stream);
   }
@@ -723,6 +724,21 @@ int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedBasisView() output
+
+  @param[in]  basis    `CeedBasis` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs) {
+  *num_tabs = basis->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Return 1D interpolation matrix to Chebyshev polynomial coefficients on quadrature space
 
@@ -1892,6 +1908,22 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedBasisView() output
+
+  @param[in] basis    `CeedBasis` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedBasisReturnCeed(basis), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  basis->num_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedBasis`
 
@@ -1904,6 +1936,7 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) {
 **/
 int CeedBasisView(CeedBasis basis, FILE *stream) {
   bool             is_tensor_basis;
+  char            *tabs = NULL;
   CeedElemTopology topo;
   CeedFESpace      fe_space;
 
@@ -1912,14 +1945,22 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
   CeedCall(CeedBasisGetTopology(basis, &topo));
   CeedCall(CeedBasisGetFESpace(basis, &fe_space));
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedBasisGetNumViewTabs(basis, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   // Print FE space and element topology of the basis
-  fprintf(stream, "CeedBasis in a %s on a %s element\n", CeedFESpaces[fe_space], CeedElemTopologies[topo]);
+  fprintf(stream, "%sCeedBasis in a %s on a %s element\n", tabs, CeedFESpaces[fe_space], CeedElemTopologies[topo]);
   if (is_tensor_basis) {
-    fprintf(stream, "  P: %" CeedInt_FMT "\n  Q: %" CeedInt_FMT "\n", basis->P_1d, basis->Q_1d);
+    fprintf(stream, "%s  P: %" CeedInt_FMT "\n%s  Q: %" CeedInt_FMT "\n", tabs, basis->P_1d, tabs, basis->Q_1d);
   } else {
-    fprintf(stream, "  P: %" CeedInt_FMT "\n  Q: %" CeedInt_FMT "\n", basis->P, basis->Q);
+    fprintf(stream, "%s  P: %" CeedInt_FMT "\n%s  Q: %" CeedInt_FMT "\n", tabs, basis->P, tabs, basis->Q);
   }
-  fprintf(stream, "  dimension: %" CeedInt_FMT "\n  field components: %" CeedInt_FMT "\n", basis->dim, basis->num_comp);
+  fprintf(stream, "%s  dimension: %" CeedInt_FMT "\n%s  field components: %" CeedInt_FMT "\n", tabs, basis->dim, tabs, basis->num_comp);
   // Print quadrature data, interpolation/gradient/divergence/curl of the basis
   if (is_tensor_basis) {  // tensor basis
     CeedInt           P_1d, Q_1d;
@@ -1932,10 +1973,10 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
     CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
     CeedCall(CeedBasisGetGrad1D(basis, &grad_1d));
 
-    CeedCall(CeedScalarView("qref1d", "\t% 12.8f", 1, Q_1d, q_ref_1d, stream));
-    CeedCall(CeedScalarView("qweight1d", "\t% 12.8f", 1, Q_1d, q_weight_1d, stream));
-    CeedCall(CeedScalarView("interp1d", "\t% 12.8f", Q_1d, P_1d, interp_1d, stream));
-    CeedCall(CeedScalarView("grad1d", "\t% 12.8f", Q_1d, P_1d, grad_1d, stream));
+    CeedCall(CeedScalarView("qref1d", "\t% 12.8f", 1, Q_1d, q_ref_1d, tabs, stream));
+    CeedCall(CeedScalarView("qweight1d", "\t% 12.8f", 1, Q_1d, q_weight_1d, tabs, stream));
+    CeedCall(CeedScalarView("interp1d", "\t% 12.8f", Q_1d, P_1d, interp_1d, tabs, stream));
+    CeedCall(CeedScalarView("grad1d", "\t% 12.8f", Q_1d, P_1d, grad_1d, tabs, stream));
   } else {  // non-tensor basis
     CeedInt           P, Q, dim, q_comp;
     const CeedScalar *q_ref, *q_weight, *interp, *grad, *div, *curl;
@@ -1950,23 +1991,24 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
     CeedCall(CeedBasisGetDiv(basis, &div));
     CeedCall(CeedBasisGetCurl(basis, &curl));
 
-    CeedCall(CeedScalarView("qref", "\t% 12.8f", 1, Q * dim, q_ref, stream));
-    CeedCall(CeedScalarView("qweight", "\t% 12.8f", 1, Q, q_weight, stream));
+    CeedCall(CeedScalarView("qref", "\t% 12.8f", 1, Q * dim, q_ref, tabs, stream));
+    CeedCall(CeedScalarView("qweight", "\t% 12.8f", 1, Q, q_weight, tabs, stream));
     CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp));
-    CeedCall(CeedScalarView("interp", "\t% 12.8f", q_comp * Q, P, interp, stream));
+    CeedCall(CeedScalarView("interp", "\t% 12.8f", q_comp * Q, P, interp, tabs, stream));
     if (grad) {
       CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp));
-      CeedCall(CeedScalarView("grad", "\t% 12.8f", q_comp * Q, P, grad, stream));
+      CeedCall(CeedScalarView("grad", "\t% 12.8f", q_comp * Q, P, grad, tabs, stream));
     }
     if (div) {
       CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp));
-      CeedCall(CeedScalarView("div", "\t% 12.8f", q_comp * Q, P, div, stream));
+      CeedCall(CeedScalarView("div", "\t% 12.8f", q_comp * Q, P, div, tabs, stream));
     }
     if (curl) {
       CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp));
-      CeedCall(CeedScalarView("curl", "\t% 12.8f", q_comp * Q, P, curl, stream));
+      CeedCall(CeedScalarView("curl", "\t% 12.8f", q_comp * Q, P, curl, tabs, stream));
     }
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 5da52c8b25..e8f5cf7a8d 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -121,6 +121,21 @@ int CeedElemRestrictionGetType(CeedElemRestriction rstr, CeedRestrictionType *rs
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedElemRestrictionView() output
+
+  @param[in]  rstr     `CeedElemRestriction` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs) {
+  *num_tabs = rstr->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the strided status of a `CeedElemRestriction`
 
@@ -1721,6 +1736,22 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedElemRestrictionView() output
+
+  @param[in] rstr     `CeedElemRestriction` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  rstr->num_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedElemRestriction`
 
@@ -1732,17 +1763,26 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult
   @ref User
 **/
 int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) {
+  char               *tabs = NULL;
   CeedRestrictionType rstr_type;
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedElemRestrictionGetNumViewTabs(rstr, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
   if (rstr_type == CEED_RESTRICTION_POINTS) {
     CeedInt max_points;
 
     CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr, &max_points));
     fprintf(stream,
-            "CeedElemRestriction at points from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with a maximum of %" CeedInt_FMT
+            "%sCeedElemRestriction at points from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with a maximum of %" CeedInt_FMT
             " points on an element\n",
-            rstr->l_size, rstr->num_comp, rstr->num_elem, max_points);
+            tabs, rstr->l_size, rstr->num_comp, rstr->num_elem, max_points);
   } else {
     char strides_str[500];
 
@@ -1752,11 +1792,12 @@ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) {
       sprintf(strides_str, "%" CeedInt_FMT, rstr->comp_stride);
     }
     fprintf(stream,
-            "%sCeedElemRestriction from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with %" CeedInt_FMT
+            "%s%sCeedElemRestriction from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with %" CeedInt_FMT
             " nodes each and %s %s\n",
-            rstr->block_size > 1 ? "Blocked " : "", rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size,
+            tabs, rstr->block_size > 1 ? "Blocked " : "", rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size,
             rstr->strides ? "strides" : "component stride", strides_str);
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 8da35425fd..cf4d5625ed 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -182,7 +182,7 @@ int CeedOperatorSingleView(CeedOperator op, const char *tabs, FILE *stream) {
 
   @return Error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Backend
 **/
 int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs) {
   *num_tabs = op->num_tabs;
@@ -1602,11 +1602,10 @@ int CeedOperatorGetName(CeedOperator op, const char **name) {
   @ref Developer
 **/
 static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
-  bool          has_name, is_composite, is_at_points;
-  char         *tabs      = NULL;
-  const char   *name      = NULL;
-  const CeedInt tab_width = 2;
-  CeedInt       num_tabs  = 0;
+  bool        has_name, is_composite, is_at_points;
+  char       *tabs     = NULL;
+  const char *name     = NULL;
+  CeedInt     num_tabs = 0;
 
   CeedCall(CeedOperatorGetName(op, &name));
   has_name = name ? strlen(name) : false;
@@ -1614,8 +1613,8 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   // Set tabs
   CeedCall(CeedOperatorGetNumViewTabs(op, &num_tabs));
-  CeedCall(CeedCalloc(tab_width * (num_tabs + is_composite) + 1, &tabs));
-  for (CeedInt i = 0; i < tab_width * num_tabs; i++) tabs[i] = ' ';
+  CeedCall(CeedCalloc(CEED_TAB_WIDTH * (num_tabs + is_composite) + 1, &tabs));
+  for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
   if (is_composite) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
@@ -1624,7 +1623,7 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
     CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     fprintf(stream, "%s", tabs);
     fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? name : "");
-    for (CeedInt i = 0; i < tab_width; i++) tabs[tab_width * num_tabs + i] = ' ';
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH; i++) tabs[CEED_TAB_WIDTH * num_tabs + i] = ' ';
     for (CeedInt i = 0; i < num_suboperators; i++) {
       has_name = sub_operators[i]->name;
       fprintf(stream, "%s", tabs);
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 4f9f5e55cd..a503939c8c 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -117,13 +117,14 @@ static int CeedQFunctionFieldSet(CeedQFunctionField *f, const char *field_name,
   @param[in] field        `CeedQFunction` field to view
   @param[in] field_number Number of field being viewed
   @param[in] in           true for input field, false for output
+  @param[in] tabs         Tabs to append before each new line
   @param[in] stream       Stream to view to, e.g., `stdout`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Utility
 **/
-static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number, bool in, FILE *stream) {
+static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number, bool in, const char *tabs, FILE *stream) {
   const char  *inout = in ? "Input" : "Output";
   const char  *field_name;
   CeedInt      size;
@@ -131,13 +132,13 @@ static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number
 
   CeedCall(CeedQFunctionFieldGetData(field, &field_name, &size, &eval_mode));
   fprintf(stream,
-          "    %s field %" CeedInt_FMT
-          ":\n"
-          "      Name: \"%s\"\n"
+          "%s    %s field %" CeedInt_FMT
+          ":\n%s"
+          "      Name: \"%s\"\n%s"
           "      Size: %" CeedInt_FMT
-          "\n"
+          "\n%s"
           "      EvalMode: \"%s\"\n",
-          inout, field_number, field_name, size, CeedEvalModes[eval_mode]);
+          tabs, inout, field_number, tabs, field_name, tabs, size, tabs, CeedEvalModes[eval_mode]);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -337,6 +338,21 @@ int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedQFunctionView() output
+
+  @param[in]  qf       `CeedQFunction` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs) {
+  *num_tabs = qf->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get global context for a `CeedQFunction`.
 
@@ -1010,6 +1026,22 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedQFunctionView() output
+
+  @param[in] qf       `CeedQFunction` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  qf->num_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedQFunction`
 
@@ -1021,20 +1053,30 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) {
   @ref User
 **/
 int CeedQFunctionView(CeedQFunction qf, FILE *stream) {
+  char       *tabs = NULL;
   const char *name;
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedQFunctionGetNumViewTabs(qf, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   CeedCall(CeedQFunctionGetName(qf, &name));
-  fprintf(stream, "%sCeedQFunction - %s\n", qf->is_gallery ? "Gallery " : "User ", name);
+  fprintf(stream, "%s%sCeedQFunction - %s\n", tabs, qf->is_gallery ? "Gallery " : "User ", name);
 
-  fprintf(stream, "  %" CeedInt_FMT " input field%s:\n", qf->num_input_fields, qf->num_input_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " input field%s:\n", tabs, qf->num_input_fields, qf->num_input_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
-    CeedCall(CeedQFunctionFieldView(qf->input_fields[i], i, 1, stream));
+    CeedCall(CeedQFunctionFieldView(qf->input_fields[i], i, 1, tabs, stream));
   }
 
-  fprintf(stream, "  %" CeedInt_FMT " output field%s:\n", qf->num_output_fields, qf->num_output_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " output field%s:\n", tabs, qf->num_output_fields, qf->num_output_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < qf->num_output_fields; i++) {
-    CeedCall(CeedQFunctionFieldView(qf->output_fields[i], i, 0, stream));
+    CeedCall(CeedQFunctionFieldView(qf->output_fields[i], i, 0, tabs, stream));
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index 2568133fcb..87f7d2158a 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -162,6 +162,21 @@ int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) {
 **/
 Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return ctx->ceed; }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedQFunctionContextView() output
+
+  @param[in]  ctx      `CeedQFunctionContext` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs) {
+  *num_tabs = ctx->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Check for valid data in a `CeedQFunctionContext`
 
@@ -881,6 +896,22 @@ int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_siz
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedQFunctionContextView() output
+
+  @param[in] ctx      `CeedQFunctionContext` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  ctx->num_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedQFunctionContext`
 
@@ -892,11 +923,22 @@ int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_siz
   @ref User
 **/
 int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream) {
-  fprintf(stream, "CeedQFunctionContext\n");
-  fprintf(stream, "  Context Data Size: %zu\n", ctx->ctx_size);
+  char *tabs = NULL;
+
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedQFunctionContextGetNumViewTabs(ctx, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
+  fprintf(stream, "%sCeedQFunctionContext\n", tabs);
+  fprintf(stream, "%s  Context Data Size: %zu\n", tabs, ctx->ctx_size);
   for (CeedInt i = 0; i < ctx->num_fields; i++) {
-    fprintf(stream, "  Labeled %s field: %s\n", CeedContextFieldTypes[ctx->field_labels[i]->type], ctx->field_labels[i]->name);
+    fprintf(stream, "%s  Labeled %s field: %s\n", tabs, CeedContextFieldTypes[ctx->field_labels[i]->type], ctx->field_labels[i]->name);
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index f4a5b7505d..ff3ea4eae5 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -39,6 +39,21 @@ const CeedVector CEED_VECTOR_NONE = &ceed_vector_none;
 /// @addtogroup CeedVectorBackend
 /// @{
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedVectorView() output
+
+  @param[in]  vec      `CeedVector` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs) {
+  *num_tabs = vec->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Check for valid data in a `CeedVector`
 
@@ -994,6 +1009,22 @@ int CeedVectorReciprocal(CeedVector vec) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedVectorView() output
+
+  @param[in] vec      `CeedVector` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedVectorReturnCeed(vec), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  vec->num_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedVector`
 
@@ -1013,24 +1044,34 @@ int CeedVectorReciprocal(CeedVector vec) {
 **/
 int CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt step, const char *fp_fmt, FILE *stream) {
   char              fmt[1024];
+  char             *tabs = NULL;
   CeedSize          length;
   const CeedScalar *x;
 
   CeedCheck(step != 0, CeedVectorReturnCeed(vec), CEED_ERROR_MINOR, "View range 'step' must be nonzero");
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedVectorGetNumViewTabs(vec, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   CeedCall(CeedVectorGetLength(vec, &length));
-  fprintf(stream, "CeedVector length %" CeedSize_FMT "\n", length);
+  fprintf(stream, "%sCeedVector length %" CeedSize_FMT "\n", tabs, length);
   if (start != 0 || stop != length || step != 1) {
-    fprintf(stream, "  start: %" CeedSize_FMT "\n  stop:  %" CeedSize_FMT "\n  step:  %" CeedInt_FMT "\n", start, stop, step);
+    fprintf(stream, "%s  start: %" CeedSize_FMT "\n%s  stop:  %" CeedSize_FMT "\n%s  step:  %" CeedInt_FMT "\n", tabs, start, tabs, stop, tabs, step);
   }
   if (start > length) start = length;
   if (stop == -1 || stop > length) stop = length;
 
-  snprintf(fmt, sizeof fmt, "  %s\n", fp_fmt ? fp_fmt : "%g");
+  snprintf(fmt, sizeof fmt, "%s  %s\n", tabs, fp_fmt ? fp_fmt : "%g");
   CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &x));
   for (CeedSize i = start; step > 0 ? (i < stop) : (i > stop); i += step) fprintf(stream, fmt, x[i]);
   CeedCall(CeedVectorRestoreArrayRead(vec, &x));
-  if (stop != length) fprintf(stream, "  ...\n");
+  if (stop != length) fprintf(stream, "%s  ...\n", tabs);
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed.c b/interface/ceed.c
index 0952389a00..e7e248a853 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -663,6 +663,21 @@ int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedView() output
+
+  @param[in]  ceed     `Ceed` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs) {
+  *num_tabs = ceed->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Flag `Ceed` context as deterministic
 
@@ -1537,6 +1552,22 @@ int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedView() output
+
+  @param[in] ceed     `Ceed` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, ceed, CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  ceed->num_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `Ceed`
 
@@ -1548,15 +1579,25 @@ int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
   @ref User
 **/
 int CeedView(Ceed ceed, FILE *stream) {
+  char       *tabs = NULL;
   CeedMemType mem_type;
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedGetNumViewTabs(ceed, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
 
   fprintf(stream,
-          "Ceed\n"
-          "  Ceed Resource: %s\n"
-          "  Preferred MemType: %s\n",
-          ceed->resource, CeedMemTypes[mem_type]);
+          "%sCeed\n"
+          "%s  Ceed Resource: %s\n"
+          "%s  Preferred MemType: %s\n",
+          tabs, tabs, ceed->resource, tabs, CeedMemTypes[mem_type]);
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 

From 681c4c4865e6a48cef1382a3d5a94c3bb19afc16 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Nov 2025 09:32:18 -0700
Subject: [PATCH 508/571] junit - drop unused arg

---
 tests/junit.py        | 3 +--
 tests/junit_common.py | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/junit.py b/tests/junit.py
index 7237594b17..5b4e8319d7 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -24,7 +24,6 @@ def create_argparser() -> argparse.ArgumentParser:
         help='Output mode, junit or tap',
         default=RunMode.JUNIT)
     parser.add_argument('-n', '--nproc', type=int, default=1, help='number of MPI processes')
-    parser.add_argument('-o', '--output', type=Optional[Path], default=None, help='Output file to write test')
     parser.add_argument('-b', '--junit-batch', type=str, default='', help='Name of JUnit batch for output file')
     parser.add_argument('-np', '--pool-size', type=int, default=1, help='Number of test cases to run in parallel')
     parser.add_argument('-s', '--search', type=str, default='.*',
@@ -213,6 +212,6 @@ def check_allowed_stdout(self, test: str) -> bool:
 
     # write output and check for failures
     if args.mode is RunMode.JUNIT:
-        write_junit_xml(result, args.output, args.junit_batch)
+        write_junit_xml(result, args.junit_batch)
         if has_failures(result):
             sys.exit(1)
diff --git a/tests/junit_common.py b/tests/junit_common.py
index 60aa2a136c..eee7924ba9 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -817,15 +817,14 @@ def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int,
     return TestSuite(test, test_cases)
 
 
-def write_junit_xml(test_suite: TestSuite, output_file: Optional[Path], batch: str = '') -> None:
+def write_junit_xml(test_suite: TestSuite, batch: str = '') -> None:
     """Write a JUnit XML file containing the results of a `TestSuite`
 
     Args:
         test_suite (TestSuite): JUnit `TestSuite` to write
-        output_file (Optional[Path]): Path to output file, or `None` to generate automatically as `build/{test_suite.name}{batch}.junit`
         batch (str): Name of JUnit batch, defaults to empty string
     """
-    output_file = output_file or Path('build') / (f'{test_suite.name}{batch}.junit')
+    output_file = Path('build') / (f'{test_suite.name}{batch}.junit')
     output_file.write_text(to_xml_report_string([test_suite]))
 
 
From 62e2d4103ebce4cf22bacb0bf84df0fd456c3171 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Nov 2025 10:57:28 -0700
Subject: [PATCH 509/571] view - update tests for Ceed*SetNumViewTabs

---
 tests/output/t107-vector.out          | 11 ++++++++
 tests/output/t210-elemrestriction.out |  1 +
 tests/output/t211-elemrestriction.out |  1 +
 tests/output/t212-elemrestriction.out |  1 +
 tests/output/t300-basis.out           | 17 +++++++++++++
 tests/output/t320-basis.out           | 21 ++++++++++++++++
 tests/output/t330-basis.out           | 36 +++++++++++++++++++++++++++
 tests/output/t340-basis.out           | 21 ++++++++++++++++
 tests/output/t402-qfunction.out       | 18 ++++++++++++++
 tests/output/t413-qfunction.out       | 15 +++++++++++
 tests/t003-ceed.c                     |  3 +++
 tests/t107-vector.c                   |  3 +++
 tests/t210-elemrestriction.c          |  2 ++
 tests/t211-elemrestriction.c          |  2 ++
 tests/t212-elemrestriction.c          |  2 ++
 tests/t300-basis.c                    |  2 ++
 tests/t320-basis.c                    |  2 ++
 tests/t330-basis.c                    |  2 ++
 tests/t340-basis.c                    |  2 ++
 tests/t402-qfunction.c                |  5 ++++
 tests/t413-qfunction.c                |  2 ++
 21 files changed, 169 insertions(+)

diff --git a/tests/output/t107-vector.out b/tests/output/t107-vector.out
index c4823d39c7..f3faa3e8ea 100644
--- a/tests/output/t107-vector.out
+++ b/tests/output/t107-vector.out
@@ -9,3 +9,14 @@ CeedVector length 10
    17.00000000
    18.00000000
    19.00000000
+  CeedVector length 10
+     10.00000000
+     11.00000000
+     12.00000000
+     13.00000000
+     14.00000000
+     15.00000000
+     16.00000000
+     17.00000000
+     18.00000000
+     19.00000000
diff --git a/tests/output/t210-elemrestriction.out b/tests/output/t210-elemrestriction.out
index 0696c8ce32..22990a413e 100644
--- a/tests/output/t210-elemrestriction.out
+++ b/tests/output/t210-elemrestriction.out
@@ -1 +1,2 @@
 CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
+  CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
diff --git a/tests/output/t211-elemrestriction.out b/tests/output/t211-elemrestriction.out
index af26a4a612..b2d7a029c4 100644
--- a/tests/output/t211-elemrestriction.out
+++ b/tests/output/t211-elemrestriction.out
@@ -1 +1,2 @@
 CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t212-elemrestriction.out b/tests/output/t212-elemrestriction.out
index a5cd6de40b..7d72d8c00f 100644
--- a/tests/output/t212-elemrestriction.out
+++ b/tests/output/t212-elemrestriction.out
@@ -1 +1,2 @@
 Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t300-basis.out b/tests/output/t300-basis.out
index ebbe0f9635..5ab53a4686 100644
--- a/tests/output/t300-basis.out
+++ b/tests/output/t300-basis.out
@@ -32,3 +32,20 @@ CeedBasis in a H^1 space on a line element
     [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
     [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
     [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
+  CeedBasis in a H^1 space on a line element
+    P: 4
+    Q: 4
+    dimension: 1
+    field components: 1
+    qref1d:   	 -0.86113631	 -0.33998104	  0.33998104	  0.86113631
+    qweight1d:	  0.34785485	  0.65214515	  0.65214515	  0.34785485
+    interp1d:
+      [0]	  0.62994317	  0.47255875	 -0.14950343	  0.04700152
+      [1]	 -0.07069480	  0.97297619	  0.13253993	 -0.03482132
+      [2]	 -0.03482132	  0.13253993	  0.97297619	 -0.07069480
+      [3]	  0.04700152	 -0.14950343	  0.47255875	  0.62994317
+    grad1d:
+      [0]	 -2.34183742	  2.78794489	 -0.63510411	  0.18899664
+      [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
+      [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
+      [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
diff --git a/tests/output/t320-basis.out b/tests/output/t320-basis.out
index a1522dd848..34c78eeaaf 100644
--- a/tests/output/t320-basis.out
+++ b/tests/output/t320-basis.out
@@ -19,3 +19,24 @@ CeedBasis in a H^1 space on a triangle element
     [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
     [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
     [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
+  CeedBasis in a H^1 space on a triangle element
+    P: 6
+    Q: 4
+    dimension: 2
+    field components: 1
+    qref:     	  0.20000000	  0.60000000	  0.33333333	  0.20000000	  0.20000000	  0.20000000	  0.33333333	  0.60000000
+    qweight:  	  0.26041667	  0.26041667	 -0.28125000	  0.26041667
+    interp:
+      [0]	  0.12000000	  0.48000000	 -0.12000000	  0.48000000	  0.16000000	 -0.12000000
+      [1]	 -0.12000000	  0.48000000	  0.12000000	  0.16000000	  0.48000000	 -0.12000000
+      [2]	 -0.11111111	  0.44444444	 -0.11111111	  0.44444444	  0.44444444	 -0.11111111
+      [3]	 -0.12000000	  0.16000000	 -0.12000000	  0.48000000	  0.48000000	  0.12000000
+    grad:
+      [0]	 -1.40000000	  1.60000000	 -0.20000000	 -0.80000000	  0.80000000	  0.00000000
+      [1]	  0.20000000	 -1.60000000	  1.40000000	 -0.80000000	  0.80000000	  0.00000000
+      [2]	 -0.33333333	  0.00000000	  0.33333333	 -1.33333333	  1.33333333	  0.00000000
+      [3]	  0.20000000	  0.00000000	 -0.20000000	 -2.40000000	  2.40000000	  0.00000000
+      [4]	 -1.40000000	 -0.80000000	  0.00000000	  1.60000000	  0.80000000	 -0.20000000
+      [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
+      [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
+      [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
diff --git a/tests/output/t330-basis.out b/tests/output/t330-basis.out
index 75e93004fc..1377df2bb5 100644
--- a/tests/output/t330-basis.out
+++ b/tests/output/t330-basis.out
@@ -34,3 +34,39 @@ CeedBasis in a H(div) space on a quadrilateral element
     [6]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
     [7]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
     [8]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+  CeedBasis in a H(div) space on a quadrilateral element
+    P: 8
+    Q: 9
+    dimension: 2
+    field components: 1
+    qref:     	 -0.77459667	  0.00000000	  0.77459667	 -0.77459667	  0.00000000	  0.77459667	 -0.77459667	  0.00000000	  0.77459667	 -0.77459667	 -0.77459667	 -0.77459667	  0.00000000	  0.00000000	  0.00000000	  0.77459667	  0.77459667	  0.77459667
+    qweight:  	  0.30864198	  0.49382716	  0.30864198	  0.49382716	  0.79012346	  0.49382716	  0.30864198	  0.49382716	  0.30864198
+    interp:
+      [0]	 -0.05000000	  0.05000000	  0.10000000	  0.01270167	 -0.05000000	  0.05000000	 -0.78729833	 -0.10000000
+      [1]	 -0.12500000	  0.12500000	  0.44364917	  0.05635083	 -0.12500000	  0.12500000	 -0.44364917	 -0.05635083
+      [2]	 -0.05000000	  0.05000000	  0.78729833	  0.10000000	 -0.05000000	  0.05000000	 -0.10000000	 -0.01270167
+      [3]	 -0.05000000	  0.05000000	  0.05635083	  0.05635083	 -0.05000000	  0.05000000	 -0.44364917	 -0.44364917
+      [4]	 -0.12500000	  0.12500000	  0.25000000	  0.25000000	 -0.12500000	  0.12500000	 -0.25000000	 -0.25000000
+      [5]	 -0.05000000	  0.05000000	  0.44364917	  0.44364917	 -0.05000000	  0.05000000	 -0.05635083	 -0.05635083
+      [6]	 -0.05000000	  0.05000000	  0.01270167	  0.10000000	 -0.05000000	  0.05000000	 -0.10000000	 -0.78729833
+      [7]	 -0.12500000	  0.12500000	  0.05635083	  0.44364917	 -0.12500000	  0.12500000	 -0.05635083	 -0.44364917
+      [8]	 -0.05000000	  0.05000000	  0.10000000	  0.78729833	 -0.05000000	  0.05000000	 -0.01270167	 -0.10000000
+      [9]	 -0.78729833	 -0.10000000	 -0.05000000	  0.05000000	  0.10000000	  0.01270167	 -0.05000000	  0.05000000
+      [10]	 -0.44364917	 -0.44364917	 -0.05000000	  0.05000000	  0.05635083	  0.05635083	 -0.05000000	  0.05000000
+      [11]	 -0.10000000	 -0.78729833	 -0.05000000	  0.05000000	  0.01270167	  0.10000000	 -0.05000000	  0.05000000
+      [12]	 -0.44364917	 -0.05635083	 -0.12500000	  0.12500000	  0.44364917	  0.05635083	 -0.12500000	  0.12500000
+      [13]	 -0.25000000	 -0.25000000	 -0.12500000	  0.12500000	  0.25000000	  0.25000000	 -0.12500000	  0.12500000
+      [14]	 -0.05635083	 -0.44364917	 -0.12500000	  0.12500000	  0.05635083	  0.44364917	 -0.12500000	  0.12500000
+      [15]	 -0.10000000	 -0.01270167	 -0.05000000	  0.05000000	  0.78729833	  0.10000000	 -0.05000000	  0.05000000
+      [16]	 -0.05635083	 -0.05635083	 -0.05000000	  0.05000000	  0.44364917	  0.44364917	 -0.05000000	  0.05000000
+      [17]	 -0.01270167	 -0.10000000	 -0.05000000	  0.05000000	  0.10000000	  0.78729833	 -0.05000000	  0.05000000
+    div:
+      [0]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [1]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [2]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [3]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [4]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [5]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [6]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [7]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [8]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
diff --git a/tests/output/t340-basis.out b/tests/output/t340-basis.out
index 5c97bec2a8..fc3b0b9123 100644
--- a/tests/output/t340-basis.out
+++ b/tests/output/t340-basis.out
@@ -19,3 +19,24 @@ CeedBasis in a H(curl) space on a triangle element
     [1]	 -1.20000000	 -1.20000000	 -8.40000000	  1.20000000	  8.40000000	 -1.20000000	 -9.60000000	  9.60000000
     [2]	 -1.20000000	  8.40000000	  1.20000000	 -8.40000000	 -1.20000000	 -1.20000000	  9.60000000	  9.60000000
     [3]	  8.40000000	 -1.20000000	  1.20000000	  1.20000000	 -1.20000000	  8.40000000	  0.00000000	-19.20000000
+  CeedBasis in a H(curl) space on a triangle element
+    P: 8
+    Q: 4
+    dimension: 2
+    field components: 1
+    qref:     	  0.33333333	  0.20000000	  0.20000000	  0.60000000	  0.33333333	  0.20000000	  0.60000000	  0.20000000
+    qweight:  	 -0.26041667	  0.26041667	  0.28125000	  0.26041667
+    interp:
+      [0]	 -0.22222222	  0.44444444	  0.22222222	 -0.44444444	 -0.22222222	 -0.22222222	  2.66666667	  0.00000000
+      [1]	  0.08000000	  0.48000000	  0.56000000	 -0.48000000	  1.04000000	 -0.72000000	  2.24000000	 -0.64000000
+      [2]	  0.24000000	 -0.48000000	 -0.24000000	  0.48000000	 -0.56000000	 -0.56000000	  2.88000000	  0.00000000
+      [3]	 -0.56000000	  0.48000000	 -0.08000000	 -0.48000000	 -0.72000000	  1.04000000	  1.60000000	  0.64000000
+      [4]	 -0.44444444	  0.22222222	 -0.22222222	 -0.22222222	  0.22222222	 -0.44444444	  0.00000000	  2.66666667
+      [5]	 -0.48000000	 -0.08000000	  1.04000000	 -0.72000000	  0.56000000	 -0.48000000	 -0.64000000	  2.24000000
+      [6]	 -0.48000000	  0.56000000	 -0.72000000	  1.04000000	 -0.08000000	 -0.48000000	  0.64000000	  1.60000000
+      [7]	  0.48000000	 -0.24000000	 -0.56000000	 -0.56000000	 -0.24000000	  0.48000000	  0.00000000	  2.88000000
+    curl:
+      [0]	  2.00000000	  2.00000000	 -2.00000000	 -2.00000000	  2.00000000	  2.00000000	  0.00000000	  0.00000000
+      [1]	 -1.20000000	 -1.20000000	 -8.40000000	  1.20000000	  8.40000000	 -1.20000000	 -9.60000000	  9.60000000
+      [2]	 -1.20000000	  8.40000000	  1.20000000	 -8.40000000	 -1.20000000	 -1.20000000	  9.60000000	  9.60000000
+      [3]	  8.40000000	 -1.20000000	  1.20000000	  1.20000000	 -1.20000000	  8.40000000	  0.00000000	-19.20000000
diff --git a/tests/output/t402-qfunction.out b/tests/output/t402-qfunction.out
index 4d131f7852..ad131d8d36 100644
--- a/tests/output/t402-qfunction.out
+++ b/tests/output/t402-qfunction.out
@@ -27,3 +27,21 @@ User CeedQFunction - mass
 CeedQFunctionContext
   Context Data Size: 40
   Labeled double field: scale
+  User CeedQFunction - mass
+    2 input fields:
+      Input field 0:
+        Name: "q data"
+        Size: 1
+        EvalMode: "none"
+      Input field 1:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
+  CeedQFunctionContext
+    Context Data Size: 40
+    Labeled double field: scale
diff --git a/tests/output/t413-qfunction.out b/tests/output/t413-qfunction.out
index ffee1bdca7..05731e4204 100644
--- a/tests/output/t413-qfunction.out
+++ b/tests/output/t413-qfunction.out
@@ -28,3 +28,18 @@ Gallery CeedQFunction - MassApply
       Name: "v"
       Size: 1
       EvalMode: "interpolation"
+  Gallery CeedQFunction - MassApply
+    2 input fields:
+      Input field 0:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+      Input field 1:
+        Name: "qdata"
+        Size: 1
+        EvalMode: "none"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
diff --git a/tests/t003-ceed.c b/tests/t003-ceed.c
index 813c0cfe49..1d58ad7ce0 100644
--- a/tests/t003-ceed.c
+++ b/tests/t003-ceed.c
@@ -11,6 +11,9 @@ int main(int argc, char **argv) {
 
   CeedView(ceed, stdout);
 
+  CeedSetNumViewTabs(ceed, 1);
+  CeedView(ceed, stdout);
+
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t107-vector.c b/tests/t107-vector.c
index ffa27a508d..b6c8993537 100644
--- a/tests/t107-vector.c
+++ b/tests/t107-vector.c
@@ -17,6 +17,9 @@ int main(int argc, char **argv) {
 
   CeedVectorView(x, "%12.8f", stdout);
 
+  CeedVectorSetNumViewTabs(x, 1);
+  CeedVectorView(x, "%12.8f", stdout);
+
   CeedVectorDestroy(&x);
   CeedDestroy(&ceed);
   return 0;
diff --git a/tests/t210-elemrestriction.c b/tests/t210-elemrestriction.c
index 7aff301411..13781acc33 100644
--- a/tests/t210-elemrestriction.c
+++ b/tests/t210-elemrestriction.c
@@ -17,6 +17,8 @@ int main(int argc, char **argv) {
   }
   CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_elem + 1, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
+  CeedElemRestrictionView(elem_restriction, stdout);
+  CeedElemRestrictionSetNumViewTabs(elem_restriction, 1);
   CeedElemRestrictionView(elem_restriction, stdout);
 
   CeedElemRestrictionDestroy(&elem_restriction);
diff --git a/tests/t211-elemrestriction.c b/tests/t211-elemrestriction.c
index 55ba2de881..3318a56f18 100644
--- a/tests/t211-elemrestriction.c
+++ b/tests/t211-elemrestriction.c
@@ -14,6 +14,8 @@ int main(int argc, char **argv) {
   CeedInt strides[3] = {1, 2, 2};
   CeedElemRestrictionCreateStrided(ceed, num_elem, 2, 1, num_elem * 2, strides, &elem_restriction);
 
+  CeedElemRestrictionView(elem_restriction, stdout);
+  CeedElemRestrictionSetNumViewTabs(elem_restriction, 1);
   CeedElemRestrictionView(elem_restriction, stdout);
 
   CeedElemRestrictionDestroy(&elem_restriction);
diff --git a/tests/t212-elemrestriction.c b/tests/t212-elemrestriction.c
index 99f5dc1cea..3914727201 100644
--- a/tests/t212-elemrestriction.c
+++ b/tests/t212-elemrestriction.c
@@ -14,6 +14,8 @@ int main(int argc, char **argv) {
   CeedInt strides[3] = {1, 2, 2};
   CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, 2, 2, 1, num_elem * 2, strides, &elem_restriction);
 
+  CeedElemRestrictionView(elem_restriction, stdout);
+  CeedElemRestrictionSetNumViewTabs(elem_restriction, 1);
   CeedElemRestrictionView(elem_restriction, stdout);
 
   CeedElemRestrictionDestroy(&elem_restriction);
diff --git a/tests/t300-basis.c b/tests/t300-basis.c
index db17332def..da563a5c0e 100644
--- a/tests/t300-basis.c
+++ b/tests/t300-basis.c
@@ -18,6 +18,8 @@ int main(int argc, char **argv) {
 
   CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 4, 4, CEED_GAUSS, &basis);
   CeedBasisView(basis, stdout);
+  CeedBasisSetNumViewTabs(basis, 1);
+  CeedBasisView(basis, stdout);
   CeedBasisDestroy(&basis);
 
   CeedDestroy(&ceed);
diff --git a/tests/t320-basis.c b/tests/t320-basis.c
index c028fcd0a5..20309ec1ed 100644
--- a/tests/t320-basis.c
+++ b/tests/t320-basis.c
@@ -20,6 +20,8 @@ int main(int argc, char **argv) {
   Build2DSimplex(q_ref, q_weight, interp, grad);
   CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
   CeedBasisView(basis, stdout);
+  CeedBasisSetNumViewTabs(basis, 1);
+  CeedBasisView(basis, stdout);
 
   CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
diff --git a/tests/t330-basis.c b/tests/t330-basis.c
index dfbf3373ff..bd96afd40d 100644
--- a/tests/t330-basis.c
+++ b/tests/t330-basis.c
@@ -21,6 +21,8 @@ int main(int argc, char **argv) {
   BuildHdivQuadrilateral(q, q_ref, q_weights, interp, div, CEED_GAUSS);
   CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, 1, p, num_qpts, interp, div, q_ref, q_weights, &basis);
   CeedBasisView(basis, stdout);
+  CeedBasisSetNumViewTabs(basis, 1);
+  CeedBasisView(basis, stdout);
 
   CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
diff --git a/tests/t340-basis.c b/tests/t340-basis.c
index e9af85ff5f..8a70269bec 100644
--- a/tests/t340-basis.c
+++ b/tests/t340-basis.c
@@ -20,6 +20,8 @@ int main(int argc, char **argv) {
   BuildHcurl2DSimplex(q_ref, q_weight, interp, curl);
   CeedBasisCreateHcurl(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, curl, q_ref, q_weight, &basis);
   CeedBasisView(basis, stdout);
+  CeedBasisSetNumViewTabs(basis, 1);
+  CeedBasisView(basis, stdout);
 
   CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
diff --git a/tests/t402-qfunction.c b/tests/t402-qfunction.c
index 2f80666e15..8e15a3e5a1 100644
--- a/tests/t402-qfunction.c
+++ b/tests/t402-qfunction.c
@@ -34,6 +34,11 @@ int main(int argc, char **argv) {
   }
   CeedQFunctionContextView(ctx, stdout);
 
+  CeedQFunctionSetNumViewTabs(qf_mass, 1);
+  CeedQFunctionView(qf_mass, stdout);
+  CeedQFunctionContextSetNumViewTabs(ctx, 1);
+  CeedQFunctionContextView(ctx, stdout);
+
   CeedQFunctionDestroy(&qf_setup);
   CeedQFunctionDestroy(&qf_mass);
   CeedQFunctionContextDestroy(&ctx);
diff --git a/tests/t413-qfunction.c b/tests/t413-qfunction.c
index 690502ae76..aeecdd639f 100644
--- a/tests/t413-qfunction.c
+++ b/tests/t413-qfunction.c
@@ -14,6 +14,8 @@ int main(int argc, char **argv) {
 
   CeedQFunctionView(qf_setup, stdout);
   CeedQFunctionView(qf_mass, stdout);
+  CeedQFunctionSetNumViewTabs(qf_mass, 1);
+  CeedQFunctionView(qf_mass, stdout);
 
   CeedQFunctionDestroy(&qf_setup);
   CeedQFunctionDestroy(&qf_mass);

From 13a7d540f7e197536fd975043a07a67f9c536b7c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 11 Nov 2025 12:54:02 -0700
Subject: [PATCH 510/571] fortran - add tab view fns

---
 interface/ceed-fortran.c                | 24 ++++++++++++++++++++++++
 tests/output/t107-vector-f.out          | 11 +++++++++++
 tests/output/t210-elemrestriction-f.out |  1 +
 tests/output/t211-elemrestriction-f.out |  1 +
 tests/output/t212-elemrestriction-f.out |  1 +
 tests/output/t300-basis-f.out           | 17 +++++++++++++++++
 tests/output/t320-basis-f.out           | 21 +++++++++++++++++++++
 tests/output/t402-qfunction-f.out       | 17 +++++++++++++++++
 tests/output/t413-qfunction-f.out       | 15 +++++++++++++++
 tests/t003-ceed-f.f90                   |  3 +++
 tests/t008-ceed.c                       |  2 +-
 tests/t107-vector-f.f90                 |  3 +++
 tests/t210-elemrestriction-f.f90        |  3 +++
 tests/t211-elemrestriction-f.f90        |  3 +++
 tests/t212-elemrestriction-f.f90        |  3 +++
 tests/t300-basis-f.f90                  |  3 +++
 tests/t320-basis-f.f90                  |  2 ++
 tests/t402-qfunction-f.f90              |  5 +++++
 tests/t413-qfunction-f.f90              |  2 ++
 19 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c
index 5da94e1b02..402f581979 100644
--- a/interface/ceed-fortran.c
+++ b/interface/ceed-fortran.c
@@ -85,6 +85,9 @@ CEED_EXTERN void fCeedIsDeterministic(int *ceed, int *is_deterministic, int *err
 #define fCeedGetPreferredMemType FORTRAN_NAME(ceedgetpreferredmemtype, CEEDGETPREFERREDMEMTYPE)
 CEED_EXTERN void fCeedGetPreferredMemType(int *ceed, int *type, int *err) { *err = CeedGetPreferredMemType(Ceed_dict[*ceed], (CeedMemType *)type); }
 
+#define fCeedSetNumViewTabs FORTRAN_NAME(ceedsetnumviewtabs, CEEDSETNUMVIEWTABS)
+CEED_EXTERN void fCeedSetNumViewTabs(int *ceed, int *num_tabs, int *err) { *err = CeedSetNumViewTabs(Ceed_dict[*ceed], *num_tabs); }
+
 #define fCeedView FORTRAN_NAME(ceedview, CEEDVIEW)
 CEED_EXTERN void fCeedView(int *ceed, int *err) { *err = CeedView(Ceed_dict[*ceed], stdout); }
 
@@ -192,6 +195,9 @@ CEED_EXTERN void fCeedVectorNorm(int *vec, int *type, CeedScalar *norm, int *err
 #define fCeedVectorReciprocal FORTRAN_NAME(ceedvectorreciprocal, CEEDVECTORRECIPROCAL)
 CEED_EXTERN void fCeedVectorReciprocal(int *vec, int *err) { *err = CeedVectorReciprocal(CeedVector_dict[*vec]); }
 
+#define fCeedVectorSetNumViewTabs FORTRAN_NAME(ceedvectorsetnumviewtabs, CEEDVECTORSETNUMVIEWTABS)
+CEED_EXTERN void fCeedVectorSetNumViewTabs(int *vec, int *num_tabs, int *err) { *err = CeedVectorSetNumViewTabs(CeedVector_dict[*vec], *num_tabs); }
+
 #define fCeedVectorView FORTRAN_NAME(ceedvectorview, CEEDVECTORVIEW)
 CEED_EXTERN void fCeedVectorView(int *vec, int *err) { *err = CeedVectorView(CeedVector_dict[*vec], "%12.8f", stdout); }
 
@@ -449,6 +455,11 @@ CEED_EXTERN void fCeedElemRestrictionGetELayout(int *elemr, int *layout, int *er
   for (int i = 0; i < 3; i++) layout[i] = layout_c[i];
 }
 
+#define fCeedElemRestrictionSetNumViewTabs FORTRAN_NAME(ceedelemrestrictionsetnumviewtabs, CEEDELEMRESTRICTIONSETNUMVIEWTABS)
+CEED_EXTERN void fCeedElemRestrictionSetNumViewTabs(int *elemr, int *num_tabs, int *err) {
+  *err = CeedElemRestrictionSetNumViewTabs(CeedElemRestriction_dict[*elemr], *num_tabs);
+}
+
 #define fCeedElemRestrictionView FORTRAN_NAME(ceedelemrestrictionview, CEEDELEMRESTRICTIONVIEW)
 CEED_EXTERN void fCeedElemRestrictionView(int *elemr, int *err) { *err = CeedElemRestrictionView(CeedElemRestriction_dict[*elemr], stdout); }
 
@@ -575,6 +586,9 @@ CEED_EXTERN void fCeedBasisCreateHcurl(int *ceed, int *topo, int *num_comp, int
   }
 }
 
+#define fCeedBasisSetNumViewTabs FORTRAN_NAME(ceedbasissetnumviewtabs, CEEDBASISSETNUMVIEWTABS)
+CEED_EXTERN void fCeedBasisSetNumViewTabs(int *basis, int *num_tabs, int *err) { *err = CeedBasisSetNumViewTabs(CeedBasis_dict[*basis], *num_tabs); }
+
 #define fCeedBasisView FORTRAN_NAME(ceedbasisview, CEEDBASISVIEW)
 CEED_EXTERN void fCeedBasisView(int *basis, int *err) { *err = CeedBasisView(CeedBasis_dict[*basis], stdout); }
 
@@ -688,6 +702,11 @@ CEED_EXTERN void fCeedQFunctionContextRestoreData(int *ctx, CeedScalar *data, in
   *offset = 0;
 }
 
+#define fCeedQFunctionContextSetNumViewTabs FORTRAN_NAME(ceedqfunctioncontextsetnumviewtabs, CEEDQFUNCTIONCONTEXTSETNUMVIEWTABS)
+CEED_EXTERN void fCeedQFunctionContextSetNumViewTabs(int *ctx, int *num_tabs, int *err) {
+  *err = CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext_dict[*ctx], *num_tabs);
+}
+
 #define fCeedQFunctionContextView FORTRAN_NAME(ceedqfunctioncontextview, CEEDQFUNCTIONCONTEXTVIEW)
 CEED_EXTERN void fCeedQFunctionContextView(int *ctx, int *err) { *err = CeedQFunctionContextView(CeedQFunctionContext_dict[*ctx], stdout); }
 
@@ -849,6 +868,11 @@ CEED_EXTERN void fCeedQFunctionSetContext(int *qf, int *ctx, int *err) {
   *err = CeedQFunctionContextDestroy(&fctx);
 }
 
+#define fCeedQFunctionSetNumViewTabs FORTRAN_NAME(ceedqfunctionsetnumviewtabs, CEEDQFUNCTIONSETNUMVIEWTABS)
+CEED_EXTERN void fCeedQFunctionSetNumViewTabs(int *qf, int *num_tabs, int *err) {
+  *err = CeedQFunctionSetNumViewTabs(CeedQFunction_dict[*qf], *num_tabs);
+}
+
 #define fCeedQFunctionView FORTRAN_NAME(ceedqfunctionview, CEEDQFUNCTIONVIEW)
 CEED_EXTERN void fCeedQFunctionView(int *qf, int *err) {
   CeedQFunction qf_ = CeedQFunction_dict[*qf];
diff --git a/tests/output/t107-vector-f.out b/tests/output/t107-vector-f.out
index c4823d39c7..f3faa3e8ea 100644
--- a/tests/output/t107-vector-f.out
+++ b/tests/output/t107-vector-f.out
@@ -9,3 +9,14 @@ CeedVector length 10
    17.00000000
    18.00000000
    19.00000000
+  CeedVector length 10
+     10.00000000
+     11.00000000
+     12.00000000
+     13.00000000
+     14.00000000
+     15.00000000
+     16.00000000
+     17.00000000
+     18.00000000
+     19.00000000
diff --git a/tests/output/t210-elemrestriction-f.out b/tests/output/t210-elemrestriction-f.out
index 0696c8ce32..22990a413e 100644
--- a/tests/output/t210-elemrestriction-f.out
+++ b/tests/output/t210-elemrestriction-f.out
@@ -1 +1,2 @@
 CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
+  CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
diff --git a/tests/output/t211-elemrestriction-f.out b/tests/output/t211-elemrestriction-f.out
index af26a4a612..b2d7a029c4 100644
--- a/tests/output/t211-elemrestriction-f.out
+++ b/tests/output/t211-elemrestriction-f.out
@@ -1 +1,2 @@
 CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t212-elemrestriction-f.out b/tests/output/t212-elemrestriction-f.out
index a5cd6de40b..7d72d8c00f 100644
--- a/tests/output/t212-elemrestriction-f.out
+++ b/tests/output/t212-elemrestriction-f.out
@@ -1 +1,2 @@
 Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t300-basis-f.out b/tests/output/t300-basis-f.out
index ebbe0f9635..5ab53a4686 100644
--- a/tests/output/t300-basis-f.out
+++ b/tests/output/t300-basis-f.out
@@ -32,3 +32,20 @@ CeedBasis in a H^1 space on a line element
     [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
     [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
     [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
+  CeedBasis in a H^1 space on a line element
+    P: 4
+    Q: 4
+    dimension: 1
+    field components: 1
+    qref1d:   	 -0.86113631	 -0.33998104	  0.33998104	  0.86113631
+    qweight1d:	  0.34785485	  0.65214515	  0.65214515	  0.34785485
+    interp1d:
+      [0]	  0.62994317	  0.47255875	 -0.14950343	  0.04700152
+      [1]	 -0.07069480	  0.97297619	  0.13253993	 -0.03482132
+      [2]	 -0.03482132	  0.13253993	  0.97297619	 -0.07069480
+      [3]	  0.04700152	 -0.14950343	  0.47255875	  0.62994317
+    grad1d:
+      [0]	 -2.34183742	  2.78794489	 -0.63510411	  0.18899664
+      [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
+      [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
+      [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
diff --git a/tests/output/t320-basis-f.out b/tests/output/t320-basis-f.out
index a1522dd848..34c78eeaaf 100644
--- a/tests/output/t320-basis-f.out
+++ b/tests/output/t320-basis-f.out
@@ -19,3 +19,24 @@ CeedBasis in a H^1 space on a triangle element
     [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
     [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
     [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
+  CeedBasis in a H^1 space on a triangle element
+    P: 6
+    Q: 4
+    dimension: 2
+    field components: 1
+    qref:     	  0.20000000	  0.60000000	  0.33333333	  0.20000000	  0.20000000	  0.20000000	  0.33333333	  0.60000000
+    qweight:  	  0.26041667	  0.26041667	 -0.28125000	  0.26041667
+    interp:
+      [0]	  0.12000000	  0.48000000	 -0.12000000	  0.48000000	  0.16000000	 -0.12000000
+      [1]	 -0.12000000	  0.48000000	  0.12000000	  0.16000000	  0.48000000	 -0.12000000
+      [2]	 -0.11111111	  0.44444444	 -0.11111111	  0.44444444	  0.44444444	 -0.11111111
+      [3]	 -0.12000000	  0.16000000	 -0.12000000	  0.48000000	  0.48000000	  0.12000000
+    grad:
+      [0]	 -1.40000000	  1.60000000	 -0.20000000	 -0.80000000	  0.80000000	  0.00000000
+      [1]	  0.20000000	 -1.60000000	  1.40000000	 -0.80000000	  0.80000000	  0.00000000
+      [2]	 -0.33333333	  0.00000000	  0.33333333	 -1.33333333	  1.33333333	  0.00000000
+      [3]	  0.20000000	  0.00000000	 -0.20000000	 -2.40000000	  2.40000000	  0.00000000
+      [4]	 -1.40000000	 -0.80000000	  0.00000000	  1.60000000	  0.80000000	 -0.20000000
+      [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
+      [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
+      [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
diff --git a/tests/output/t402-qfunction-f.out b/tests/output/t402-qfunction-f.out
index 7163a434f1..dc4e005814 100644
--- a/tests/output/t402-qfunction-f.out
+++ b/tests/output/t402-qfunction-f.out
@@ -26,3 +26,20 @@ User CeedQFunction - mass
       EvalMode: "interpolation"
 CeedQFunctionContext
   Context Data Size: 40
+  User CeedQFunction - mass
+    2 input fields:
+      Input field 0:
+        Name: "qdata"
+        Size: 1
+        EvalMode: "none"
+      Input field 1:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
+  CeedQFunctionContext
+    Context Data Size: 40
diff --git a/tests/output/t413-qfunction-f.out b/tests/output/t413-qfunction-f.out
index ffee1bdca7..05731e4204 100644
--- a/tests/output/t413-qfunction-f.out
+++ b/tests/output/t413-qfunction-f.out
@@ -28,3 +28,18 @@ Gallery CeedQFunction - MassApply
       Name: "v"
       Size: 1
       EvalMode: "interpolation"
+  Gallery CeedQFunction - MassApply
+    2 input fields:
+      Input field 0:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+      Input field 1:
+        Name: "qdata"
+        Size: 1
+        EvalMode: "none"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
diff --git a/tests/t003-ceed-f.f90 b/tests/t003-ceed-f.f90
index 61c00b3535..00147b869d 100644
--- a/tests/t003-ceed-f.f90
+++ b/tests/t003-ceed-f.f90
@@ -12,6 +12,9 @@ program test
 
       call ceedview(ceed,err)
 
+      call ceedsetnumviewtabs(ceed,1,err)
+      call ceedview(ceed,err)
+
       call ceeddestroy(ceed,err)
 
       end
diff --git a/tests/t008-ceed.c b/tests/t008-ceed.c
index 24b3fecff6..344b341ae7 100644
--- a/tests/t008-ceed.c
+++ b/tests/t008-ceed.c
@@ -11,7 +11,7 @@ int main(int argc, char **argv) {
   sprintf(help_resource, "help:%s", argv[1]);
 
   CeedInit(help_resource, &ceed);
-  CeedDestroy(&ceed);
 
+  CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t107-vector-f.f90 b/tests/t107-vector-f.f90
index 44531fe72b..51c2b79ff5 100644
--- a/tests/t107-vector-f.f90
+++ b/tests/t107-vector-f.f90
@@ -25,6 +25,9 @@ program test
 
       call ceedvectorview(x,err)
 
+      call ceedvectorsetnumviewtabs(x,1,err)
+      call ceedvectorview(x,err)
+
       call ceedvectordestroy(x,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t210-elemrestriction-f.f90 b/tests/t210-elemrestriction-f.f90
index b22c4fe5d8..7bad3f941a 100644
--- a/tests/t210-elemrestriction-f.f90
+++ b/tests/t210-elemrestriction-f.f90
@@ -27,6 +27,9 @@ program test
 
       call ceedelemrestrictionview(r,err)
 
+      call ceedelemrestrictionsetnumviewtabs(r,1,err)
+      call ceedelemrestrictionview(r,err)
+
       call ceedelemrestrictiondestroy(r,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t211-elemrestriction-f.f90 b/tests/t211-elemrestriction-f.f90
index 6d86c9c685..4cc27845fd 100644
--- a/tests/t211-elemrestriction-f.f90
+++ b/tests/t211-elemrestriction-f.f90
@@ -20,6 +20,9 @@ program test
 
       call ceedelemrestrictionview(r,err)
 
+      call ceedelemrestrictionsetnumviewtabs(r,1,err)
+      call ceedelemrestrictionview(r,err)
+
       call ceedelemrestrictiondestroy(r,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t212-elemrestriction-f.f90 b/tests/t212-elemrestriction-f.f90
index b36f7c2ea3..9d1341052a 100644
--- a/tests/t212-elemrestriction-f.f90
+++ b/tests/t212-elemrestriction-f.f90
@@ -21,6 +21,9 @@ program test
 
       call ceedelemrestrictionview(r,err)
 
+      call ceedelemrestrictionsetnumviewtabs(r,1,err)
+      call ceedelemrestrictionview(r,err)
+
       call ceedelemrestrictiondestroy(r,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t300-basis-f.f90 b/tests/t300-basis-f.f90
index 1397a5a403..6aef5a0c34 100644
--- a/tests/t300-basis-f.f90
+++ b/tests/t300-basis-f.f90
@@ -18,7 +18,10 @@ program test
 
       call ceedbasiscreatetensorh1lagrange(ceed,1,1,4,4,ceed_gauss,b,err)
       call ceedbasisview(b,err)
+      call ceedbasissetnumviewtabs(b,1,err)
+      call ceedbasisview(b,err)
       call ceedbasisdestroy(b,err)
+
       call ceeddestroy(ceed,err)
 
       end
diff --git a/tests/t320-basis-f.f90 b/tests/t320-basis-f.f90
index 46dffdede5..c8eb67fee8 100644
--- a/tests/t320-basis-f.f90
+++ b/tests/t320-basis-f.f90
@@ -32,6 +32,8 @@ program test
       call ceedbasiscreateh1(ceed,ceed_triangle,1,p,q,interp,grad,qref,qweight,&
      & b,err)
       call ceedbasisview(b,err)
+      call ceedbasissetnumviewtabs(b,1,err)
+      call ceedbasisview(b,err)
 
       call ceedbasisdestroy(b,err)
       call ceeddestroy(ceed,err)
diff --git a/tests/t402-qfunction-f.f90 b/tests/t402-qfunction-f.f90
index e87bcc4f2b..45cac092ce 100644
--- a/tests/t402-qfunction-f.f90
+++ b/tests/t402-qfunction-f.f90
@@ -49,6 +49,11 @@ program test
      & ctxdata,coffset,err)
       call ceedqfunctioncontextview(ctx,err)
 
+      call ceedqfunctionsetnumviewtabs(qf_mass,1,err)
+      call ceedqfunctionview(qf_mass,err)
+      call ceedqfunctioncontextsetnumviewtabs(ctx,1,err)
+      call ceedqfunctioncontextview(ctx,err)
+
       call ceedqfunctiondestroy(qf_setup,err)
       call ceedqfunctiondestroy(qf_mass,err)
       call ceeddestroy(ceed,err)
diff --git a/tests/t413-qfunction-f.f90 b/tests/t413-qfunction-f.f90
index c6a43d7361..754d881a4d 100644
--- a/tests/t413-qfunction-f.f90
+++ b/tests/t413-qfunction-f.f90
@@ -15,6 +15,8 @@ program test
 
       call ceedqfunctionview(qf_setup,err)
       call ceedqfunctionview(qf_mass,err)
+      call ceedqfunctionsetnumviewtabs(qf_mass,1,err)
+      call ceedqfunctionview(qf_mass,err)
 
       call ceedqfunctiondestroy(qf_setup,err)
       call ceedqfunctiondestroy(qf_mass,err)

From 648ecb3c64fa28ff6913386e8556e83e2c4c6241 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 20 Nov 2025 16:20:32 -0700
Subject: [PATCH 511/571] minor - move Ceed*GetNumViewTabs to public header

---
 include/ceed/backend.h | 7 -------
 include/ceed/ceed.h    | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index a5785b4937..627d2907c2 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -252,7 +252,6 @@ CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj
 CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name);
 CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed);
 CEED_EXTERN int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed);
-CEED_EXTERN int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs);
 CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic);
 CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void));
 CEED_EXTERN int CeedGetData(Ceed ceed, void *data);
@@ -269,7 +268,6 @@ CEED_EXTERN int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_
 CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines);
 CEED_EXTERN int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines);
 
-CEED_EXTERN int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs);
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
 CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
@@ -298,7 +296,6 @@ typedef enum {
 } CeedRestrictionType;
 
 CEED_EXTERN int CeedElemRestrictionGetType(CeedElemRestriction rstr, CeedRestrictionType *rstr_type);
-CEED_EXTERN int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs);
 CEED_EXTERN int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided);
 CEED_EXTERN int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points);
 CEED_EXTERN int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible);
@@ -339,7 +336,6 @@ CEED_EXTERN const char *const CeedFESpaces[];
 
 CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *colo_grad_1d);
 CEED_EXTERN int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d);
-CEED_EXTERN int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs);
 CEED_EXTERN int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor);
 CEED_EXTERN int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated);
 CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data);
@@ -380,7 +376,6 @@ CEED_EXTERN int CeedQFunctionGetName(CeedQFunction qf, const char **name);
 CEED_EXTERN int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path);
 CEED_EXTERN int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer);
 CEED_EXTERN int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f);
-CEED_EXTERN int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs);
 CEED_EXTERN int CeedQFunctionGetContext(CeedQFunction qf, CeedQFunctionContext *ctx);
 CEED_EXTERN int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, void *data);
 CEED_EXTERN int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data);
@@ -398,7 +393,6 @@ CEED_EXTERN int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops)
 
 CEED_EXTERN int  CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed);
 CEED_EXTERN Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx);
-CEED_EXTERN int  CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs);
 CEED_EXTERN int  CeedQFunctionContextHasValidData(CeedQFunctionContext ctx, bool *has_valid_data);
 CEED_EXTERN int  CeedQFunctionContextHasBorrowedDataOfType(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type);
 CEED_EXTERN int  CeedQFunctionContextGetState(CeedQFunctionContext ctx, uint64_t *state);
@@ -456,7 +450,6 @@ CEED_EXTERN int CeedOperatorAssemblyDataGetElemRestrictions(CeedOperatorAssembly
                                                             CeedElemRestriction **active_elem_rstrs_out);
 CEED_EXTERN int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data);
 
-CEED_EXTERN int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs);
 CEED_EXTERN int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis);
 CEED_EXTERN int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis);
 CEED_EXTERN int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *active_rstr);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 69c8afe281..49d7983732 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -110,6 +110,7 @@ CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
 CEED_EXTERN int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root);
 CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
 CEED_EXTERN int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs);
+CEED_EXTERN int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs);
 CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
 CEED_EXTERN int CeedDestroy(Ceed *ceed);
 CEED_EXTERN int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, int ecode, const char *format, ...);
@@ -206,6 +207,7 @@ CEED_EXTERN int  CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta
 CEED_EXTERN int  CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y);
 CEED_EXTERN int  CeedVectorReciprocal(CeedVector vec);
 CEED_EXTERN int  CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs);
+CEED_EXTERN int  CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs);
 CEED_EXTERN int  CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt step, const char *fp_fmt, FILE *stream);
 CEED_EXTERN int  CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream);
 CEED_EXTERN int  CeedVectorGetCeed(CeedVector vec, Ceed *ceed);
@@ -296,6 +298,7 @@ CEED_EXTERN int  CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, CeedI
 CEED_EXTERN int  CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, CeedInt *block_size);
 CEED_EXTERN int  CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult);
 CEED_EXTERN int  CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs);
+CEED_EXTERN int  CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs);
 CEED_EXTERN int  CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream);
 CEED_EXTERN int  CeedElemRestrictionDestroy(CeedElemRestriction *rstr);
 
@@ -316,6 +319,7 @@ CEED_EXTERN int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt n
 CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project);
 CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy);
 CEED_EXTERN int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs);
+CEED_EXTERN int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs);
 CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream);
 CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
 CEED_EXTERN int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
@@ -373,6 +377,7 @@ CEED_EXTERN int  CeedQFunctionSetContext(CeedQFunction qf, CeedQFunctionContext
 CEED_EXTERN int  CeedQFunctionSetContextWritable(CeedQFunction qf, bool is_writable);
 CEED_EXTERN int  CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops);
 CEED_EXTERN int  CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs);
+CEED_EXTERN int  CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs);
 CEED_EXTERN int  CeedQFunctionView(CeedQFunction qf, FILE *stream);
 CEED_EXTERN int  CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed);
 CEED_EXTERN Ceed CeedQFunctionReturnCeed(CeedQFunction qf);
@@ -412,6 +417,7 @@ CEED_EXTERN int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label,
                                                     const char **field_description, CeedContextFieldType *field_type);
 CEED_EXTERN int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_size);
 CEED_EXTERN int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs);
+CEED_EXTERN int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs);
 CEED_EXTERN int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream);
 CEED_EXTERN int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f);
 CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx);
@@ -460,6 +466,7 @@ CEED_EXTERN int  CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperat
 CEED_EXTERN int  CeedOperatorSetName(CeedOperator op, const char *name);
 CEED_EXTERN int  CeedOperatorGetName(CeedOperator op, const char **name);
 CEED_EXTERN int  CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs);
+CEED_EXTERN int  CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs);
 CEED_EXTERN int  CeedOperatorView(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorViewTerse(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorGetCeed(CeedOperator op, Ceed *ceed);

From 690992b2ef7dcef8d00e259f2fa17075fde4c330 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 21 Nov 2025 11:08:16 -0700
Subject: [PATCH 512/571] minor - move implementations to match headers

---
 interface/ceed-basis.c            | 30 +++++++++++++++---------------
 interface/ceed-elemrestriction.c  | 30 +++++++++++++++---------------
 interface/ceed-operator.c         | 30 +++++++++++++++---------------
 interface/ceed-qfunction.c        | 30 +++++++++++++++---------------
 interface/ceed-qfunctioncontext.c | 30 +++++++++++++++---------------
 interface/ceed-vector.c           | 30 +++++++++++++++---------------
 interface/ceed.c                  | 30 +++++++++++++++---------------
 7 files changed, 105 insertions(+), 105 deletions(-)

diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index ce1ccae1dc..d93dc273b6 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -724,21 +724,6 @@ int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get the number of tabs to indent for @ref CeedBasisView() output
-
-  @param[in]  basis    `CeedBasis` to get the number of view tabs
-  @param[out] num_tabs Number of view tabs
-
-  @return Error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs) {
-  *num_tabs = basis->num_tabs;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Return 1D interpolation matrix to Chebyshev polynomial coefficients on quadrature space
 
@@ -1924,6 +1909,21 @@ int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedBasisView() output
+
+  @param[in]  basis    `CeedBasis` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs) {
+  *num_tabs = basis->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedBasis`
 
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index e8f5cf7a8d..7408bbce2e 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -121,21 +121,6 @@ int CeedElemRestrictionGetType(CeedElemRestriction rstr, CeedRestrictionType *rs
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get the number of tabs to indent for @ref CeedElemRestrictionView() output
-
-  @param[in]  rstr     `CeedElemRestriction` to get the number of view tabs
-  @param[out] num_tabs Number of view tabs
-
-  @return Error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs) {
-  *num_tabs = rstr->num_tabs;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Get the strided status of a `CeedElemRestriction`
 
@@ -1752,6 +1737,21 @@ int CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedElemRestrictionView() output
+
+  @param[in]  rstr     `CeedElemRestriction` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs) {
+  *num_tabs = rstr->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedElemRestriction`
 
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index cf4d5625ed..466d8d0cd9 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -174,21 +174,6 @@ int CeedOperatorSingleView(CeedOperator op, const char *tabs, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get the number of tabs to indent for @ref CeedOperatorView() output
-
-  @param[in]  op       `CeedOperator` to get the number of view tabs
-  @param[out] num_tabs Number of view tabs
-
-  @return Error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs) {
-  *num_tabs = op->num_tabs;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`.
 
@@ -1656,6 +1641,21 @@ int CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedOperatorView() output
+
+  @param[in]  op       `CeedOperator` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs) {
+  *num_tabs = op->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedOperator`
 
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index a503939c8c..956348892f 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -338,21 +338,6 @@ int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get the number of tabs to indent for @ref CeedQFunctionView() output
-
-  @param[in]  qf       `CeedQFunction` to get the number of view tabs
-  @param[out] num_tabs Number of view tabs
-
-  @return Error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs) {
-  *num_tabs = qf->num_tabs;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Get global context for a `CeedQFunction`.
 
@@ -1042,6 +1027,21 @@ int CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedQFunctionView() output
+
+  @param[in]  qf       `CeedQFunction` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs) {
+  *num_tabs = qf->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedQFunction`
 
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index 87f7d2158a..8919726495 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -162,21 +162,6 @@ int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) {
 **/
 Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return ctx->ceed; }
 
-/**
-  @brief Get the number of tabs to indent for @ref CeedQFunctionContextView() output
-
-  @param[in]  ctx      `CeedQFunctionContext` to get the number of view tabs
-  @param[out] num_tabs Number of view tabs
-
-  @return Error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs) {
-  *num_tabs = ctx->num_tabs;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Check for valid data in a `CeedQFunctionContext`
 
@@ -912,6 +897,21 @@ int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tab
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedQFunctionContextView() output
+
+  @param[in]  ctx      `CeedQFunctionContext` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs) {
+  *num_tabs = ctx->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedQFunctionContext`
 
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index ff3ea4eae5..b73cdfbf5a 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -39,21 +39,6 @@ const CeedVector CEED_VECTOR_NONE = &ceed_vector_none;
 /// @addtogroup CeedVectorBackend
 /// @{
 
-/**
-  @brief Get the number of tabs to indent for @ref CeedVectorView() output
-
-  @param[in]  vec      `CeedVector` to get the number of view tabs
-  @param[out] num_tabs Number of view tabs
-
-  @return Error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs) {
-  *num_tabs = vec->num_tabs;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Check for valid data in a `CeedVector`
 
@@ -1025,6 +1010,21 @@ int CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedVectorView() output
+
+  @param[in]  vec      `CeedVector` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs) {
+  *num_tabs = vec->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedVector`
 
diff --git a/interface/ceed.c b/interface/ceed.c
index e7e248a853..133ddabfd3 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -663,21 +663,6 @@ int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get the number of tabs to indent for @ref CeedView() output
-
-  @param[in]  ceed     `Ceed` to get the number of view tabs
-  @param[out] num_tabs Number of view tabs
-
-  @return Error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs) {
-  *num_tabs = ceed->num_tabs;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Flag `Ceed` context as deterministic
 
@@ -1568,6 +1553,21 @@ int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the number of tabs to indent for @ref CeedView() output
+
+  @param[in]  ceed     `Ceed` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs) {
+  *num_tabs = ceed->num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `Ceed`
 

From afeb93e9a539977d805c4dfebb022b4afb833c26 Mon Sep 17 00:00:00 2001
From: Kartiki Pande <kpande3289@sdsu.edu>
Date: Thu, 11 Dec 2025 07:57:09 -0800
Subject: [PATCH 513/571] Kartiki p ex3 volume julia (#1909)

* Add ex3-volume.jl for volume mesh example

Implement example for volume mesh transformation and execution.

* Update ex3-volume

Error : Field 'qdata' of size 4 and EvalMode none: ElemRestriction has 1 components
Needs debugging.

* Update julia/LibCEED.jl/examples/ex3-volume.jl

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>

* Update julia/LibCEED.jl/examples/ex3-volume.jl

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>

* Diff Applied

* Remove foo.diff

* Apply Jeremy's ex3-volume patch

---------

Co-authored-by: Jeremy L Thompson <jeremy@jeremylt.org>
---
 julia/LibCEED.jl/examples/ex3-volume.jl | 197 ++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 julia/LibCEED.jl/examples/ex3-volume.jl

diff --git a/julia/LibCEED.jl/examples/ex3-volume.jl b/julia/LibCEED.jl/examples/ex3-volume.jl
new file mode 100644
index 0000000000..68edf59817
--- /dev/null
+++ b/julia/LibCEED.jl/examples/ex3-volume.jl
@@ -0,0 +1,197 @@
+using LibCEED, Printf
+
+include("common.jl")
+
+function transform_mesh_coords!(dim, mesh_size, mesh_coords)
+    @witharray coords = mesh_coords begin
+        if dim == 1
+            for i = 1:mesh_size
+                # map [0,1] to [0,1] varying the mesh density
+                coords[i] = 0.5 + 1.0/sqrt(3.0)*sin((2.0/3.0)*pi*(coords[i] - 0.5))
+            end
+            exact_volume = 1.0
+        else
+            num_nodes = mesh_size÷dim
+            @inbounds @simd for i = 1:num_nodes
+                # map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+                # coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+                u = coords[i]
+                v = coords[i+num_nodes]
+                u = 1.0 + u
+                v = pi/2*v
+                coords[i] = u*cos(v)
+                coords[i+num_nodes] = u*sin(v)
+            end
+            exact_volume = 3.0/4.0*pi
+        end
+        return exact_volume
+    end
+end
+
+function run_ex3(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size)
+    ncompx = dim
+    prob_size < 0 && (prob_size = 256*1024)
+
+    ceed = Ceed(ceed_spec)
+    mesh_basis =
+        create_tensor_h1_lagrange_basis(ceed, dim, ncompx, mesh_order + 1, num_qpts, GAUSS)
+    sol_basis =
+        create_tensor_h1_lagrange_basis(ceed, dim, 1, sol_order + 1, num_qpts, GAUSS)
+
+    # Determine the mesh size based on the given approximate problem size.
+    nxyz = get_cartesian_mesh_size(dim, sol_order, prob_size)
+    println("Mesh size: ", nxyz)
+
+    # Build CeedElemRestriction objects describing the mesh and solution discrete
+    # representations.
+    mesh_size, mesh_rstr, _ =
+        build_cartesian_restriction(ceed, dim, nxyz, mesh_order, ncompx, num_qpts)
+    num_q_comp = 1 + div(dim*(dim + 1), 2)
+    sol_size, _, qdata_rstr_i = build_cartesian_restriction(
+        ceed,
+        dim,
+        nxyz,
+        sol_order,
+        num_q_comp,
+        num_qpts,
+        mode=StridedOnly,
+    )
+    sol_size, sol_rstr, sol_rstr_i = build_cartesian_restriction(
+        ceed,
+        dim,
+        nxyz,
+        sol_order,
+        1,
+        num_qpts,
+        mode=RestrictionAndStrided,
+    )
+    println("Number of mesh nodes     : ", div(mesh_size, dim))
+    println("Number of solution nodes : ", sol_size)
+
+    # Create a CeedVector with the mesh coordinates.
+    mesh_coords = CeedVector(ceed, mesh_size)
+    set_cartesian_mesh_coords!(dim, nxyz, mesh_order, mesh_coords)
+    # Apply a transformation to the mesh.
+    exact_vol = transform_mesh_coords!(dim, mesh_size, mesh_coords)
+
+    #Create the Q-function that builds the mass+diffusion operator ( i.e it computes the quadrature data) and set its context data.
+    @interior_qf build_qfunc = (
+        ceed,
+        dim=dim,
+        (dx, :in, EVAL_GRAD, dim, dim),      # ← THIS LINE: dx input
+        (weights, :in, EVAL_WEIGHT),         # ← weights input
+        (qdata, :out, EVAL_NONE, num_q_comp), # ← qdata output
+        begin
+            # Compute determinant
+            det_J = det(dx)
+
+            # Store mass component
+            qdata[1] = weights*det_J
+
+            # Store diffusion components (J^T * J)
+            idx = 2
+            for i = 1:dim
+                for j = i:dim
+                    qdata[idx] = dx[:, i]'*dx[:, j]
+                    idx += 1
+                end
+            end
+        end,
+    )
+
+    # Create the operator that builds the quadrature data for the mass+diffusion operator.
+    build_oper = Operator(
+        ceed,
+        qf=build_qfunc,
+        fields=[
+            (:dx, mesh_rstr, mesh_basis, CeedVectorActive()),
+            (:weights, ElemRestrictionNone(), mesh_basis, CeedVectorNone()),
+            (:qdata, qdata_rstr_i, BasisNone(), CeedVectorActive()),
+        ],
+    )
+
+    # Compute the quadrature data for the mass+diff operator.
+    elem_qpts = num_qpts^dim
+    num_elem = prod(nxyz)
+    qdata = CeedVector(ceed, num_elem*elem_qpts*num_q_comp)
+    print("Computing the quadrature data for the mass+diffusion operator ...")
+    flush(stdout)
+    apply!(build_oper, mesh_coords, qdata)
+    println(" done.")
+
+    # Create the Q-function that defines the action of the mass+diffusion operator.
+    @interior_qf apply_qfunc = (
+        ceed,
+        dim=dim,
+        (u, :in, EVAL_INTERP),
+        (du, :in, EVAL_GRAD, dim),
+        (qdata, :in, EVAL_NONE, num_q_comp),
+        (v, :out, EVAL_INTERP),
+        (dv, :out, EVAL_GRAD, dim),
+        begin
+            # Apply mass: v = qdata[1] * u
+            v .= qdata[1].*u
+
+            # Apply diffusion: dv = (qdata[2:end]) * du
+            # The qdata contains the symmetric diffusion tensor (J^T*J)
+            # dv_i = sum_j (J^T*J)_{i,j} * du_j
+
+            # For efficiency, rebuild the matrix from stored components
+            idx = 2
+            for i = 1:dim
+                dv_i = 0.0
+                for j = 1:dim
+                    # Reconstruct symmetric matrix element
+                    if j >= i
+                        mat_idx = idx + div((j - 1)*j, 2) + (i - 1)
+                    else
+                        mat_idx = idx + div((i - 1)*i, 2) + (j - 1)
+                    end
+                    dv_i += qdata[mat_idx]*du[j]
+                end
+                dv[i] = dv_i
+            end
+        end,
+    )
+
+    # Create the mass+diffusion operator.
+    oper = Operator(
+        ceed,
+        qf=apply_qfunc,
+        fields=[
+            (:u, sol_rstr, sol_basis, CeedVectorActive()),
+            (:du, sol_rstr, sol_basis, CeedVectorActive()),
+            (:qdata, qdata_rstr_i, BasisNone(), qdata),
+            (:v, sol_rstr, sol_basis, CeedVectorActive()),
+            (:dv, sol_rstr, sol_basis, CeedVectorActive()),
+        ],
+    )
+
+    # Compute the mesh volume using the mass+diffusion operator: vol = 1^T \cdot (M + K) \cdot 1
+    print("Computing the mesh volume using the formula: vol = 1^T * (M + K) * 1...")
+    flush(stdout)
+    # Create auxiliary solution-size vectors.
+    u = CeedVector(ceed, sol_size)
+    v = CeedVector(ceed, sol_size)
+    # Initialize 'u' with ones.
+    u[] = 1.0
+    # Apply the mass+diffusion operator: 'u' -> 'v'.
+    apply!(oper, u, v)
+    # Compute and print the sum of the entries of 'v' giving the mesh volume.
+    vol = witharray_read(sum, v, MEM_HOST)
+
+    println(" done.")
+    @printf("Exact mesh volume    : % .14g\n", exact_vol)
+    @printf("Computed mesh volume : % .14g\n", vol)
+    @printf("Volume error         : % .14g\n", vol - exact_vol)
+end
+
+# Entry point
+run_ex3(
+    ceed_spec="/cpu/self",
+    dim=3,
+    mesh_order=4,
+    sol_order=4,
+    num_qpts=4 + 2,
+    prob_size=-1,
+)

From b413b6aed3bbcdbc1ec551c290bc716fd91c53bb Mon Sep 17 00:00:00 2001
From: Gabriele Bozzola <gbozzola@amazon.com>
Date: Fri, 12 Dec 2025 09:33:29 -0800
Subject: [PATCH 514/571] Use compiler definition for Intel oneAPI

`__INTEL_COMPILER` does no longer exist on LLVM-based Intel compilers.
The new macro is `__INTEL_LLVM_COMPILER`

https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2023-2/use-predefined-macros-to-specify-intel-compilers.html
---
 include/ceed/types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ceed/types.h b/include/ceed/types.h
index 098afd5987..5492816012 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -25,7 +25,7 @@
 #ifndef __NO_INLINE__
 #if defined(__GNUC__) || defined(__clang__)
 #define CEED_QFUNCTION_ATTR __attribute__((flatten))
-#elif defined(__INTEL_COMPILER)
+#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #define CEED_QFUNCTION_ATTR _Pragma("forceinline")
 #else
 #define CEED_QFUNCTION_ATTR
@@ -103,7 +103,7 @@ values for CPU backends.
     Code generation backends may redefine this macro, as needed.
 **/
 #ifndef CeedPragmaSIMD
-#if defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #define CeedPragmaSIMD _Pragma("vector")
 /// Cannot use Intel pragma ivdep because it miscompiles unpacking symmetric tensors, as in Poisson2DApply, where the SIMD loop body contains
 /// temporaries such as the following.

From 9ba83ac0e4b1fca39d6fa6737a318a9f0cbc172d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 19 Dec 2025 12:03:59 -0700
Subject: [PATCH 515/571] minor - update copyright to 2026

---
 LICENSE                                                         | 2 +-
 Makefile                                                        | 2 +-
 README.md                                                       | 2 +-
 backends/avx/ceed-avx-blocked.c                                 | 2 +-
 backends/avx/ceed-avx-serial.c                                  | 2 +-
 backends/avx/ceed-avx-tensor.c                                  | 2 +-
 backends/avx/ceed-avx.h                                         | 2 +-
 backends/blocked/ceed-blocked-operator.c                        | 2 +-
 backends/blocked/ceed-blocked.c                                 | 2 +-
 backends/blocked/ceed-blocked.h                                 | 2 +-
 backends/ceed-backend-list.h                                    | 2 +-
 backends/ceed-backend-weak.c                                    | 2 +-
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp              | 2 +-
 backends/cuda-gen/ceed-cuda-gen-operator-build.h                | 2 +-
 backends/cuda-gen/ceed-cuda-gen-operator.c                      | 2 +-
 backends/cuda-gen/ceed-cuda-gen-qfunction.c                     | 2 +-
 backends/cuda-gen/ceed-cuda-gen.c                               | 2 +-
 backends/cuda-gen/ceed-cuda-gen.h                               | 2 +-
 backends/cuda-ref/ceed-cuda-ref-basis.c                         | 2 +-
 backends/cuda-ref/ceed-cuda-ref-operator.c                      | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp              | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunction-load.h                | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunction.c                     | 2 +-
 backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c              | 2 +-
 backends/cuda-ref/ceed-cuda-ref-restriction.c                   | 2 +-
 backends/cuda-ref/ceed-cuda-ref-vector.c                        | 2 +-
 backends/cuda-ref/ceed-cuda-ref.c                               | 2 +-
 backends/cuda-ref/ceed-cuda-ref.h                               | 2 +-
 backends/cuda-ref/kernels/cuda-ref-vector.cu                    | 2 +-
 backends/cuda-shared/ceed-cuda-shared-basis.c                   | 2 +-
 backends/cuda-shared/ceed-cuda-shared.c                         | 2 +-
 backends/cuda-shared/ceed-cuda-shared.h                         | 2 +-
 backends/cuda/ceed-cuda-common.c                                | 2 +-
 backends/cuda/ceed-cuda-common.h                                | 2 +-
 backends/cuda/ceed-cuda-compile.cpp                             | 2 +-
 backends/cuda/ceed-cuda-compile.h                               | 2 +-
 backends/hip-gen/ceed-hip-gen-operator-build.cpp                | 2 +-
 backends/hip-gen/ceed-hip-gen-operator-build.h                  | 2 +-
 backends/hip-gen/ceed-hip-gen-operator.c                        | 2 +-
 backends/hip-gen/ceed-hip-gen-qfunction.c                       | 2 +-
 backends/hip-gen/ceed-hip-gen.c                                 | 2 +-
 backends/hip-gen/ceed-hip-gen.h                                 | 2 +-
 backends/hip-ref/ceed-hip-ref-basis.c                           | 2 +-
 backends/hip-ref/ceed-hip-ref-operator.c                        | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunction-load.cpp                | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunction-load.h                  | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunction.c                       | 2 +-
 backends/hip-ref/ceed-hip-ref-qfunctioncontext.c                | 2 +-
 backends/hip-ref/ceed-hip-ref-restriction.c                     | 2 +-
 backends/hip-ref/ceed-hip-ref-vector.c                          | 2 +-
 backends/hip-ref/ceed-hip-ref.c                                 | 2 +-
 backends/hip-ref/ceed-hip-ref.h                                 | 2 +-
 backends/hip-ref/kernels/hip-ref-vector.hip.cpp                 | 2 +-
 backends/hip-shared/ceed-hip-shared-basis.c                     | 2 +-
 backends/hip-shared/ceed-hip-shared.c                           | 2 +-
 backends/hip-shared/ceed-hip-shared.h                           | 2 +-
 backends/hip/ceed-hip-common.c                                  | 2 +-
 backends/hip/ceed-hip-common.h                                  | 2 +-
 backends/hip/ceed-hip-compile.cpp                               | 2 +-
 backends/hip/ceed-hip-compile.h                                 | 2 +-
 backends/magma/ceed-magma-basis.c                               | 2 +-
 backends/magma/ceed-magma-common.c                              | 2 +-
 backends/magma/ceed-magma-common.h                              | 2 +-
 backends/magma/ceed-magma-det.c                                 | 2 +-
 backends/magma/ceed-magma-gemm-nontensor.cpp                    | 2 +-
 backends/magma/ceed-magma-gemm-nontensor.h                      | 2 +-
 backends/magma/ceed-magma-gemm-selector.cpp                     | 2 +-
 backends/magma/ceed-magma-gemm-selector.h                       | 2 +-
 backends/magma/ceed-magma.c                                     | 2 +-
 backends/magma/ceed-magma.h                                     | 2 +-
 backends/magma/tuning/Makefile                                  | 2 +-
 backends/magma/tuning/generate_tuning.py                        | 2 +-
 backends/magma/tuning/tuning.cpp                                | 2 +-
 backends/memcheck/ceed-memcheck-blocked.c                       | 2 +-
 backends/memcheck/ceed-memcheck-qfunction.c                     | 2 +-
 backends/memcheck/ceed-memcheck-qfunctioncontext.c              | 2 +-
 backends/memcheck/ceed-memcheck-restriction.c                   | 2 +-
 backends/memcheck/ceed-memcheck-serial.c                        | 2 +-
 backends/memcheck/ceed-memcheck-vector.c                        | 2 +-
 backends/memcheck/ceed-memcheck.h                               | 2 +-
 backends/occa/ceed-occa-basis.cpp                               | 2 +-
 backends/occa/ceed-occa-basis.hpp                               | 2 +-
 backends/occa/ceed-occa-ceed-object.cpp                         | 2 +-
 backends/occa/ceed-occa-ceed-object.hpp                         | 2 +-
 backends/occa/ceed-occa-context.cpp                             | 2 +-
 backends/occa/ceed-occa-context.hpp                             | 2 +-
 backends/occa/ceed-occa-cpu-operator.cpp                        | 2 +-
 backends/occa/ceed-occa-cpu-operator.hpp                        | 2 +-
 backends/occa/ceed-occa-elem-restriction.cpp                    | 2 +-
 backends/occa/ceed-occa-elem-restriction.hpp                    | 2 +-
 backends/occa/ceed-occa-gpu-operator.cpp                        | 2 +-
 backends/occa/ceed-occa-gpu-operator.hpp                        | 2 +-
 backends/occa/ceed-occa-kernels.hpp                             | 2 +-
 backends/occa/ceed-occa-operator-args.cpp                       | 2 +-
 backends/occa/ceed-occa-operator-args.hpp                       | 2 +-
 backends/occa/ceed-occa-operator-field.cpp                      | 2 +-
 backends/occa/ceed-occa-operator-field.hpp                      | 2 +-
 backends/occa/ceed-occa-operator.cpp                            | 2 +-
 backends/occa/ceed-occa-operator.hpp                            | 2 +-
 backends/occa/ceed-occa-qfunction-args.cpp                      | 2 +-
 backends/occa/ceed-occa-qfunction-args.hpp                      | 2 +-
 backends/occa/ceed-occa-qfunction-field.cpp                     | 2 +-
 backends/occa/ceed-occa-qfunction-field.hpp                     | 2 +-
 backends/occa/ceed-occa-qfunction.cpp                           | 2 +-
 backends/occa/ceed-occa-qfunction.hpp                           | 2 +-
 backends/occa/ceed-occa-qfunctioncontext.cpp                    | 2 +-
 backends/occa/ceed-occa-qfunctioncontext.hpp                    | 2 +-
 backends/occa/ceed-occa-simplex-basis.cpp                       | 2 +-
 backends/occa/ceed-occa-simplex-basis.hpp                       | 2 +-
 backends/occa/ceed-occa-tensor-basis.cpp                        | 2 +-
 backends/occa/ceed-occa-tensor-basis.hpp                        | 2 +-
 backends/occa/ceed-occa-types.hpp                               | 2 +-
 backends/occa/ceed-occa-vector.cpp                              | 2 +-
 backends/occa/ceed-occa-vector.hpp                              | 2 +-
 backends/occa/ceed-occa.cpp                                     | 2 +-
 backends/occa/ceed-occa.h                                       | 2 +-
 backends/occa/kernels/elem-restriction.cpp                      | 2 +-
 backends/occa/kernels/elem-restriction.hpp                      | 2 +-
 backends/occa/kernels/kernel-defines.hpp                        | 2 +-
 backends/occa/kernels/set-value.cpp                             | 2 +-
 backends/occa/kernels/set-value.hpp                             | 2 +-
 backends/occa/kernels/simplex-basis.hpp                         | 2 +-
 backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp       | 2 +-
 backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp       | 2 +-
 backends/occa/kernels/tensor-basis.hpp                          | 2 +-
 backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp      | 2 +-
 backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp      | 2 +-
 backends/opt/ceed-opt-blocked.c                                 | 2 +-
 backends/opt/ceed-opt-operator.c                                | 2 +-
 backends/opt/ceed-opt-serial.c                                  | 2 +-
 backends/opt/ceed-opt-tensor.c                                  | 2 +-
 backends/opt/ceed-opt.h                                         | 2 +-
 backends/ref/ceed-ref-basis.c                                   | 2 +-
 backends/ref/ceed-ref-operator.c                                | 2 +-
 backends/ref/ceed-ref-qfunction.c                               | 2 +-
 backends/ref/ceed-ref-qfunctioncontext.c                        | 2 +-
 backends/ref/ceed-ref-restriction.c                             | 2 +-
 backends/ref/ceed-ref-tensor.c                                  | 2 +-
 backends/ref/ceed-ref-vector.c                                  | 2 +-
 backends/ref/ceed-ref.c                                         | 2 +-
 backends/ref/ceed-ref.h                                         | 2 +-
 backends/sycl-gen/ceed-sycl-gen-operator-build.hpp              | 2 +-
 backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp         | 2 +-
 backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp               | 2 +-
 backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp              | 2 +-
 backends/sycl-gen/ceed-sycl-gen.hpp                             | 2 +-
 backends/sycl-gen/ceed-sycl-gen.sycl.cpp                        | 2 +-
 backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp                  | 2 +-
 backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp               | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp              | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp         | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp              | 2 +-
 backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp       | 2 +-
 backends/sycl-ref/ceed-sycl-ref.hpp                             | 2 +-
 backends/sycl-ref/ceed-sycl-ref.sycl.cpp                        | 2 +-
 backends/sycl-ref/ceed-sycl-restriction.sycl.cpp                | 2 +-
 backends/sycl-ref/ceed-sycl-vector.sycl.cpp                     | 2 +-
 backends/sycl-ref/kernels/sycl-ref-vector.cpp                   | 2 +-
 backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp            | 2 +-
 backends/sycl-shared/ceed-sycl-shared.hpp                       | 2 +-
 backends/sycl-shared/ceed-sycl-shared.sycl.cpp                  | 2 +-
 backends/sycl/ceed-sycl-common.hpp                              | 2 +-
 backends/sycl/ceed-sycl-common.sycl.cpp                         | 2 +-
 backends/sycl/ceed-sycl-compile.hpp                             | 2 +-
 backends/sycl/ceed-sycl-compile.sycl.cpp                        | 2 +-
 backends/xsmm/ceed-xsmm-blocked.c                               | 2 +-
 backends/xsmm/ceed-xsmm-serial.c                                | 2 +-
 backends/xsmm/ceed-xsmm-tensor.c                                | 2 +-
 backends/xsmm/ceed-xsmm.h                                       | 2 +-
 benchmarks/benchmark.sh                                         | 2 +-
 benchmarks/petsc-bps.sh                                         | 2 +-
 benchmarks/petsc-bpsraw.sh                                      | 2 +-
 benchmarks/postprocess_base.py                                  | 2 +-
 benchmarks/postprocess_plot.py                                  | 2 +-
 benchmarks/postprocess_table.py                                 | 2 +-
 common.mk                                                       | 2 +-
 examples/ceed/Makefile                                          | 2 +-
 examples/ceed/ex1-volume.c                                      | 2 +-
 examples/ceed/ex1-volume.h                                      | 2 +-
 examples/ceed/ex2-surface.c                                     | 2 +-
 examples/ceed/ex2-surface.h                                     | 2 +-
 examples/ceed/ex3-volume.c                                      | 2 +-
 examples/ceed/ex3-volume.h                                      | 2 +-
 examples/fluids/include/bc_definition.h                         | 2 +-
 examples/fluids/include/log_events.h                            | 2 +-
 examples/fluids/include/mat-ceed-impl.h                         | 2 +-
 examples/fluids/include/mat-ceed.h                              | 2 +-
 examples/fluids/include/petsc-ceed-utils.h                      | 2 +-
 examples/fluids/include/petsc-ceed.h                            | 2 +-
 examples/fluids/include/petsc_ops.h                             | 2 +-
 examples/fluids/navierstokes.c                                  | 2 +-
 examples/fluids/navierstokes.h                                  | 2 +-
 examples/fluids/problems/advection.c                            | 2 +-
 examples/fluids/problems/bc_freestream.c                        | 2 +-
 examples/fluids/problems/bc_slip.c                              | 2 +-
 examples/fluids/problems/blasius.c                              | 2 +-
 examples/fluids/problems/channel.c                              | 2 +-
 examples/fluids/problems/densitycurrent.c                       | 2 +-
 examples/fluids/problems/eulervortex.c                          | 2 +-
 examples/fluids/problems/gaussianwave.c                         | 2 +-
 examples/fluids/problems/newtonian.c                            | 2 +-
 examples/fluids/problems/shocktube.c                            | 2 +-
 examples/fluids/problems/stg_shur14.c                           | 2 +-
 examples/fluids/problems/stg_shur14.h                           | 2 +-
 examples/fluids/problems/taylorgreen.c                          | 2 +-
 examples/fluids/qfunctions/advection.h                          | 2 +-
 examples/fluids/qfunctions/advection_types.h                    | 2 +-
 examples/fluids/qfunctions/bc_freestream.h                      | 2 +-
 examples/fluids/qfunctions/bc_freestream_type.h                 | 2 +-
 examples/fluids/qfunctions/bc_slip.h                            | 2 +-
 examples/fluids/qfunctions/blasius.h                            | 2 +-
 examples/fluids/qfunctions/channel.h                            | 2 +-
 examples/fluids/qfunctions/densitycurrent.h                     | 2 +-
 examples/fluids/qfunctions/differential_filter.h                | 2 +-
 examples/fluids/qfunctions/differential_filter_enums.h          | 2 +-
 examples/fluids/qfunctions/eulervortex.h                        | 2 +-
 examples/fluids/qfunctions/gaussianwave.h                       | 2 +-
 examples/fluids/qfunctions/grid_anisotropy_tensor.h             | 2 +-
 examples/fluids/qfunctions/inverse_multiplicity.h               | 2 +-
 examples/fluids/qfunctions/mass.h                               | 2 +-
 examples/fluids/qfunctions/newtonian.h                          | 2 +-
 examples/fluids/qfunctions/newtonian_state.h                    | 2 +-
 examples/fluids/qfunctions/newtonian_types.h                    | 2 +-
 examples/fluids/qfunctions/riemann_solver.h                     | 2 +-
 examples/fluids/qfunctions/setupgeo.h                           | 2 +-
 examples/fluids/qfunctions/setupgeo2d.h                         | 2 +-
 examples/fluids/qfunctions/setupgeo_helpers.h                   | 2 +-
 examples/fluids/qfunctions/shocktube.h                          | 2 +-
 examples/fluids/qfunctions/stabilization.h                      | 2 +-
 examples/fluids/qfunctions/stabilization_types.h                | 2 +-
 examples/fluids/qfunctions/stg_shur14.h                         | 2 +-
 examples/fluids/qfunctions/stg_shur14_type.h                    | 2 +-
 examples/fluids/qfunctions/strong_boundary_conditions.h         | 2 +-
 examples/fluids/qfunctions/taylorgreen.h                        | 2 +-
 examples/fluids/qfunctions/turb_spanstats.h                     | 2 +-
 examples/fluids/qfunctions/turb_stats_types.h                   | 2 +-
 examples/fluids/qfunctions/utils.h                              | 2 +-
 examples/fluids/qfunctions/utils_eigensolver_jacobi.h           | 2 +-
 examples/fluids/qfunctions/velocity_gradient_projection.h       | 2 +-
 examples/fluids/src/bc_definition.c                             | 2 +-
 examples/fluids/src/boundary_condition.c                        | 2 +-
 examples/fluids/src/cloptions.c                                 | 2 +-
 examples/fluids/src/differential_filter.c                       | 2 +-
 examples/fluids/src/dm_utils.c                                  | 2 +-
 examples/fluids/src/grid_anisotropy_tensor.c                    | 2 +-
 examples/fluids/src/inverse_multiplicity.c                      | 2 +-
 examples/fluids/src/log_events.c                                | 2 +-
 examples/fluids/src/misc.c                                      | 2 +-
 examples/fluids/src/petsc_ops.c                                 | 2 +-
 examples/fluids/src/qdata.c                                     | 2 +-
 examples/fluids/src/setupdm.c                                   | 2 +-
 examples/fluids/src/setuplibceed.c                              | 2 +-
 examples/fluids/src/setupts.c                                   | 2 +-
 examples/fluids/src/strong_boundary_conditions.c                | 2 +-
 examples/fluids/src/turb_spanstats.c                            | 2 +-
 examples/fluids/src/velocity_gradient_projection.c              | 2 +-
 examples/mfem/Makefile                                          | 2 +-
 examples/mfem/bp1.cpp                                           | 2 +-
 examples/mfem/bp1.h                                             | 2 +-
 examples/mfem/bp1.hpp                                           | 2 +-
 examples/mfem/bp3.cpp                                           | 2 +-
 examples/mfem/bp3.h                                             | 2 +-
 examples/mfem/bp3.hpp                                           | 2 +-
 examples/nek/bps/bps.h                                          | 2 +-
 examples/nek/bps/bps.usr                                        | 2 +-
 examples/petsc/Makefile                                         | 2 +-
 examples/petsc/area.c                                           | 2 +-
 examples/petsc/area.h                                           | 2 +-
 examples/petsc/bps.c                                            | 2 +-
 examples/petsc/bps.h                                            | 2 +-
 examples/petsc/bpsraw.c                                         | 2 +-
 examples/petsc/bpssphere.c                                      | 2 +-
 examples/petsc/bpssphere.h                                      | 2 +-
 examples/petsc/bpsswarm.c                                       | 2 +-
 examples/petsc/dmswarm.c                                        | 2 +-
 examples/petsc/include/areaproblemdata.h                        | 2 +-
 examples/petsc/include/bpsproblemdata.h                         | 2 +-
 examples/petsc/include/libceedsetup.h                           | 2 +-
 examples/petsc/include/matops.h                                 | 2 +-
 examples/petsc/include/petscutils.h                             | 2 +-
 examples/petsc/include/petscversion.h                           | 2 +-
 examples/petsc/include/sphereproblemdata.h                      | 2 +-
 examples/petsc/include/structs.h                                | 2 +-
 examples/petsc/include/swarmutils.h                             | 2 +-
 examples/petsc/multigrid.c                                      | 2 +-
 examples/petsc/qfunctions/area/areacube.h                       | 2 +-
 examples/petsc/qfunctions/area/areasphere.h                     | 2 +-
 examples/petsc/qfunctions/bps/bp1.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp13.h                            | 2 +-
 examples/petsc/qfunctions/bps/bp1sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/bp2.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp24.h                            | 2 +-
 examples/petsc/qfunctions/bps/bp2sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/bp3.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp3sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/bp4.h                             | 2 +-
 examples/petsc/qfunctions/bps/bp4sphere.h                       | 2 +-
 examples/petsc/qfunctions/bps/common.h                          | 2 +-
 examples/petsc/qfunctions/swarm/swarmmass.h                     | 2 +-
 examples/petsc/src/libceedsetup.c                               | 2 +-
 examples/petsc/src/petscutils.c                                 | 2 +-
 examples/petsc/src/swarmutils.c                                 | 2 +-
 examples/python/Makefile                                        | 2 +-
 examples/python/conftest.py                                     | 2 +-
 examples/python/ex1_volume.py                                   | 2 +-
 examples/python/ex2_surface.py                                  | 2 +-
 examples/python/ex3_volume.py                                   | 2 +-
 examples/python/ex_common.py                                    | 2 +-
 examples/python/ex_test.py                                      | 2 +-
 examples/python/qfunctions/ex-common.h                          | 2 +-
 examples/python/qfunctions/ex1-volume.h                         | 2 +-
 examples/python/qfunctions/ex2-surface.h                        | 2 +-
 examples/python/qfunctions/ex3-volume.h                         | 2 +-
 examples/python/qfunctions/qfunctions.c                         | 2 +-
 examples/rust-qfunctions/Makefile                               | 2 +-
 examples/rust-qfunctions/ex1-volume.c                           | 2 +-
 examples/rust-qfunctions/ex1-volume.h                           | 2 +-
 examples/rust/ex1-volume-vector/src/main.rs                     | 2 +-
 examples/rust/ex1-volume-vector/src/opt.rs                      | 2 +-
 examples/rust/ex1-volume-vector/src/transform.rs                | 2 +-
 examples/rust/ex1-volume/src/main.rs                            | 2 +-
 examples/rust/ex1-volume/src/opt.rs                             | 2 +-
 examples/rust/ex1-volume/src/transform.rs                       | 2 +-
 examples/rust/ex2-surface-vector/src/main.rs                    | 2 +-
 examples/rust/ex2-surface-vector/src/opt.rs                     | 2 +-
 examples/rust/ex2-surface-vector/src/transform.rs               | 2 +-
 examples/rust/ex2-surface/src/main.rs                           | 2 +-
 examples/rust/ex2-surface/src/opt.rs                            | 2 +-
 examples/rust/ex2-surface/src/transform.rs                      | 2 +-
 examples/rust/ex3-volume-vector/src/main.rs                     | 2 +-
 examples/rust/ex3-volume-vector/src/opt.rs                      | 2 +-
 examples/rust/ex3-volume-vector/src/transform.rs                | 2 +-
 examples/rust/ex3-volume/src/main.rs                            | 2 +-
 examples/rust/ex3-volume/src/opt.rs                             | 2 +-
 examples/rust/ex3-volume/src/transform.rs                       | 2 +-
 examples/rust/mesh/src/lib.rs                                   | 2 +-
 examples/solids/Makefile                                        | 2 +-
 examples/solids/elasticity.c                                    | 2 +-
 examples/solids/elasticity.h                                    | 2 +-
 examples/solids/include/boundary.h                              | 2 +-
 examples/solids/include/cl-options.h                            | 2 +-
 examples/solids/include/matops.h                                | 2 +-
 examples/solids/include/misc.h                                  | 2 +-
 examples/solids/include/setup-dm.h                              | 2 +-
 examples/solids/include/setup-libceed.h                         | 2 +-
 examples/solids/include/structs.h                               | 2 +-
 examples/solids/include/utils.h                                 | 2 +-
 examples/solids/problems/cl-problems.h                          | 2 +-
 examples/solids/problems/finite-strain-mooney-rivlin.c          | 2 +-
 examples/solids/problems/finite-strain-neo-hookean.c            | 2 +-
 examples/solids/problems/linear.c                               | 2 +-
 examples/solids/problems/mooney-rivlin.c                        | 2 +-
 examples/solids/problems/mooney-rivlin.h                        | 2 +-
 examples/solids/problems/neo-hookean.c                          | 2 +-
 examples/solids/problems/neo-hookean.h                          | 2 +-
 examples/solids/problems/problems.c                             | 2 +-
 examples/solids/problems/problems.h                             | 2 +-
 examples/solids/qfunctions/common.h                             | 2 +-
 examples/solids/qfunctions/constant-force.h                     | 2 +-
 examples/solids/qfunctions/finite-strain-mooney-rivlin.h        | 2 +-
 examples/solids/qfunctions/finite-strain-neo-hookean.h          | 2 +-
 examples/solids/qfunctions/linear.h                             | 2 +-
 examples/solids/qfunctions/manufactured-force.h                 | 2 +-
 examples/solids/qfunctions/manufactured-true.h                  | 2 +-
 examples/solids/qfunctions/traction-boundary.h                  | 2 +-
 examples/solids/src/boundary.c                                  | 2 +-
 examples/solids/src/cl-options.c                                | 2 +-
 examples/solids/src/matops.c                                    | 2 +-
 examples/solids/src/misc.c                                      | 2 +-
 examples/solids/src/setup-dm.c                                  | 2 +-
 examples/solids/src/setup-libceed.c                             | 2 +-
 gallery/ceed-gallery-list.h                                     | 2 +-
 gallery/ceed-gallery-weak.c                                     | 2 +-
 gallery/identity/ceed-identity.c                                | 2 +-
 gallery/mass-vector/ceed-vectormassapply.c                      | 2 +-
 gallery/mass/ceed-mass1dbuild.c                                 | 2 +-
 gallery/mass/ceed-mass2dbuild.c                                 | 2 +-
 gallery/mass/ceed-mass3dbuild.c                                 | 2 +-
 gallery/mass/ceed-massapply.c                                   | 2 +-
 gallery/poisson-vector/ceed-vectorpoisson1dapply.c              | 2 +-
 gallery/poisson-vector/ceed-vectorpoisson2dapply.c              | 2 +-
 gallery/poisson-vector/ceed-vectorpoisson3dapply.c              | 2 +-
 gallery/poisson/ceed-poisson1dapply.c                           | 2 +-
 gallery/poisson/ceed-poisson1dbuild.c                           | 2 +-
 gallery/poisson/ceed-poisson2dapply.c                           | 2 +-
 gallery/poisson/ceed-poisson2dbuild.c                           | 2 +-
 gallery/poisson/ceed-poisson3dapply.c                           | 2 +-
 gallery/poisson/ceed-poisson3dbuild.c                           | 2 +-
 gallery/scale/ceed-scale.c                                      | 2 +-
 include/ceed-fortran-name.h                                     | 2 +-
 include/ceed-impl.h                                             | 2 +-
 include/ceed/backend.h                                          | 2 +-
 include/ceed/ceed-f32.h                                         | 2 +-
 include/ceed/ceed-f64.h                                         | 2 +-
 include/ceed/ceed.h                                             | 2 +-
 include/ceed/cuda.h                                             | 2 +-
 include/ceed/deprecated.h                                       | 2 +-
 include/ceed/fortran.h                                          | 2 +-
 include/ceed/hip.h                                              | 2 +-
 include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h         | 2 +-
 include/ceed/jit-source/cuda/cuda-gen-templates.h               | 2 +-
 include/ceed/jit-source/cuda/cuda-jit.h                         | 2 +-
 .../ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h   | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h         | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h  | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h            | 2 +-
 .../ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h  | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h       | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-qfunction.h               | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h   | 2 +-
 .../ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h   | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h      | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h    | 2 +-
 include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h     | 2 +-
 .../jit-source/cuda/cuda-shared-basis-nontensor-templates.h     | 2 +-
 include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h      | 2 +-
 .../jit-source/cuda/cuda-shared-basis-read-write-templates.h    | 2 +-
 .../cuda/cuda-shared-basis-tensor-at-points-templates.h         | 2 +-
 .../ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h   | 2 +-
 .../cuda/cuda-shared-basis-tensor-flattened-templates.h         | 2 +-
 .../ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h   | 2 +-
 include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h         | 2 +-
 include/ceed/jit-source/cuda/cuda-types.h                       | 2 +-
 include/ceed/jit-source/gallery/ceed-identity.h                 | 2 +-
 include/ceed/jit-source/gallery/ceed-mass1dbuild.h              | 2 +-
 include/ceed/jit-source/gallery/ceed-mass2dbuild.h              | 2 +-
 include/ceed/jit-source/gallery/ceed-mass3dbuild.h              | 2 +-
 include/ceed/jit-source/gallery/ceed-massapply.h                | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson1dapply.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson1dbuild.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson2dapply.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson2dbuild.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson3dapply.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-poisson3dbuild.h           | 2 +-
 include/ceed/jit-source/gallery/ceed-scale.h                    | 2 +-
 include/ceed/jit-source/gallery/ceed-vectormassapply.h          | 2 +-
 include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h     | 2 +-
 include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h     | 2 +-
 include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h     | 2 +-
 include/ceed/jit-source/hip/hip-gen-templates.h                 | 2 +-
 include/ceed/jit-source/hip/hip-jit.h                           | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-nontensor.h           | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h    | 2 +-
 include/ceed/jit-source/hip/hip-ref-basis-tensor.h              | 2 +-
 .../ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h    | 2 +-
 include/ceed/jit-source/hip/hip-ref-operator-assemble.h         | 2 +-
 include/ceed/jit-source/hip/hip-ref-qfunction.h                 | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-at-points.h     | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-offset.h        | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-oriented.h      | 2 +-
 include/ceed/jit-source/hip/hip-ref-restriction-strided.h       | 2 +-
 .../ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h  | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-nontensor.h        | 2 +-
 .../ceed/jit-source/hip/hip-shared-basis-read-write-templates.h | 2 +-
 .../hip/hip-shared-basis-tensor-at-points-templates.h           | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h | 2 +-
 .../hip/hip-shared-basis-tensor-flattened-templates.h           | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h | 2 +-
 include/ceed/jit-source/hip/hip-shared-basis-tensor.h           | 2 +-
 include/ceed/jit-source/hip/hip-types.h                         | 2 +-
 include/ceed/jit-source/magma/magma-basis-grad-1d.h             | 2 +-
 include/ceed/jit-source/magma/magma-basis-grad-2d.h             | 2 +-
 include/ceed/jit-source/magma/magma-basis-grad-3d.h             | 2 +-
 include/ceed/jit-source/magma/magma-basis-interp-1d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-interp-2d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-interp-3d.h           | 2 +-
 .../ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h  | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-1d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-2d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-3d.h           | 2 +-
 include/ceed/jit-source/magma/magma-basis-weight-nontensor.h    | 2 +-
 include/ceed/jit-source/magma/magma-common-defs.h               | 2 +-
 include/ceed/jit-source/magma/magma-common-nontensor.h          | 2 +-
 include/ceed/jit-source/magma/magma-common-tensor.h             | 2 +-
 include/ceed/jit-source/sycl/sycl-gen-templates.h               | 2 +-
 include/ceed/jit-source/sycl/sycl-jit.h                         | 2 +-
 include/ceed/jit-source/sycl/sycl-ref-qfunction.h               | 2 +-
 .../jit-source/sycl/sycl-shared-basis-read-write-templates.h    | 2 +-
 .../ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h   | 2 +-
 include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h         | 2 +-
 include/ceed/jit-source/sycl/sycl-types.h                       | 2 +-
 include/ceed/jit-tools.h                                        | 2 +-
 include/ceed/types.h                                            | 2 +-
 interface/ceed-basis.c                                          | 2 +-
 interface/ceed-config.c                                         | 2 +-
 interface/ceed-cuda.c                                           | 2 +-
 interface/ceed-elemrestriction.c                                | 2 +-
 interface/ceed-fortran.c                                        | 2 +-
 interface/ceed-hip.c                                            | 2 +-
 interface/ceed-jit-source-root-default.c                        | 2 +-
 interface/ceed-jit-source-root-install.c                        | 2 +-
 interface/ceed-jit-tools.c                                      | 2 +-
 interface/ceed-operator.c                                       | 2 +-
 interface/ceed-preconditioning.c                                | 2 +-
 interface/ceed-qfunction-register.c                             | 2 +-
 interface/ceed-qfunction.c                                      | 2 +-
 interface/ceed-qfunctioncontext.c                               | 2 +-
 interface/ceed-register.c                                       | 2 +-
 interface/ceed-tensor.c                                         | 2 +-
 interface/ceed-types.c                                          | 2 +-
 interface/ceed-vector.c                                         | 2 +-
 interface/ceed.c                                                | 2 +-
 python/__init__.py                                              | 2 +-
 python/build_ceed_cffi.py                                       | 2 +-
 python/ceed.py                                                  | 2 +-
 python/ceed_basis.py                                            | 2 +-
 python/ceed_constants.py                                        | 2 +-
 python/ceed_elemrestriction.py                                  | 2 +-
 python/ceed_operator.py                                         | 2 +-
 python/ceed_qfunction.py                                        | 2 +-
 python/ceed_qfunctioncontext.py                                 | 2 +-
 python/ceed_vector.py                                           | 2 +-
 python/tests/Makefile                                           | 2 +-
 python/tests/conftest.py                                        | 2 +-
 python/tests/libceed-qfunctions.c                               | 2 +-
 python/tests/setup-qfunctions.py                                | 2 +-
 python/tests/setup.cfg                                          | 2 +-
 python/tests/test-0-ceed.py                                     | 2 +-
 python/tests/test-1-vector.py                                   | 2 +-
 python/tests/test-2-elemrestriction.py                          | 2 +-
 python/tests/test-3-basis.py                                    | 2 +-
 python/tests/test-4-qfunction.py                                | 2 +-
 python/tests/test-5-operator.py                                 | 2 +-
 python/tests/test-qfunctions.h                                  | 2 +-
 rust/libceed-sys/src/lib.rs                                     | 2 +-
 rust/libceed/src/basis.rs                                       | 2 +-
 rust/libceed/src/elem_restriction.rs                            | 2 +-
 rust/libceed/src/lib.rs                                         | 2 +-
 rust/libceed/src/operator.rs                                    | 2 +-
 rust/libceed/src/qfunction.rs                                   | 2 +-
 rust/libceed/src/vector.rs                                      | 2 +-
 rust/libceed/tests/version-numbers.rs                           | 2 +-
 tests/t319-basis.h                                              | 2 +-
 tests/t320-basis-f.h                                            | 2 +-
 tests/t320-basis.h                                              | 2 +-
 tests/t330-basis.h                                              | 2 +-
 tests/t340-basis.h                                              | 2 +-
 tests/t400-qfunction.h                                          | 2 +-
 tests/t401-qfunction.h                                          | 2 +-
 tests/t405-qfunction.h                                          | 2 +-
 tests/t406-qfunction-helper.h                                   | 2 +-
 tests/t406-qfunction-scales.h                                   | 2 +-
 tests/t406-qfunction.h                                          | 2 +-
 tests/t409-qfunction.h                                          | 2 +-
 tests/t500-operator.h                                           | 2 +-
 tests/t502-operator.h                                           | 2 +-
 tests/t507-operator.h                                           | 2 +-
 tests/t510-operator.h                                           | 2 +-
 tests/t522-operator.h                                           | 2 +-
 tests/t530-operator.h                                           | 2 +-
 tests/t531-operator.h                                           | 2 +-
 tests/t532-operator.h                                           | 2 +-
 tests/t534-operator.h                                           | 2 +-
 tests/t535-operator.h                                           | 2 +-
 tests/t537-operator.h                                           | 2 +-
 tests/t539-operator.h                                           | 2 +-
 tests/t540-operator.h                                           | 2 +-
 tests/t541-operator.h                                           | 2 +-
 tests/t566-operator.h                                           | 2 +-
 tests/t567-operator.h                                           | 2 +-
 tests/t568-operator.h                                           | 2 +-
 tests/t580-operator.h                                           | 2 +-
 tests/t590-operator.h                                           | 2 +-
 tests/t591-operator.h                                           | 2 +-
 tests/t595-operator.h                                           | 2 +-
 tests/t596-operator.h                                           | 2 +-
 tests/t597-operator.h                                           | 2 +-
 573 files changed, 573 insertions(+), 573 deletions(-)

diff --git a/LICENSE b/LICENSE
index 2cefa6edd3..85888e282b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 2-Clause License
 
-Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/Makefile b/Makefile
index 1c95362776..07f557ff1b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/README.md b/README.md
index 65f1819aa9..6a2f050cae 100644
--- a/README.md
+++ b/README.md
@@ -480,7 +480,7 @@ The BibTeX entries for these references can be found in the `doc/bib/references.
 
 The following copyright applies to each file in the CEED software suite, unless otherwise stated in the file:
 
-> Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+> Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 > All rights reserved.
 
 See files LICENSE and NOTICE for details.
diff --git a/backends/avx/ceed-avx-blocked.c b/backends/avx/ceed-avx-blocked.c
index d9098b779e..8452fd8591 100644
--- a/backends/avx/ceed-avx-blocked.c
+++ b/backends/avx/ceed-avx-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/avx/ceed-avx-serial.c b/backends/avx/ceed-avx-serial.c
index e45294c03d..06ed5f9fdb 100644
--- a/backends/avx/ceed-avx-serial.c
+++ b/backends/avx/ceed-avx-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/avx/ceed-avx-tensor.c b/backends/avx/ceed-avx-tensor.c
index e58b6eee67..40d5df0646 100644
--- a/backends/avx/ceed-avx-tensor.c
+++ b/backends/avx/ceed-avx-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/avx/ceed-avx.h b/backends/avx/ceed-avx.h
index d9d354b59f..cb151baa85 100644
--- a/backends/avx/ceed-avx.h
+++ b/backends/avx/ceed-avx.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 9f52f8f0da..61663b2406 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/blocked/ceed-blocked.c b/backends/blocked/ceed-blocked.c
index 7dcf10038c..f50d2fc91e 100644
--- a/backends/blocked/ceed-blocked.c
+++ b/backends/blocked/ceed-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h
index dacb05b840..e1976d6e43 100644
--- a/backends/blocked/ceed-blocked.h
+++ b/backends/blocked/ceed-blocked.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
index 2464453c5f..8650d956e2 100644
--- a/backends/ceed-backend-list.h
+++ b/backends/ceed-backend-list.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ceed-backend-weak.c b/backends/ceed-backend-weak.c
index c7edeb1861..81abd45a3d 100644
--- a/backends/ceed-backend-weak.c
+++ b/backends/ceed-backend-weak.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 804109754f..1cfd9c97d7 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
index 3568e57ee3..8fd3ee12c5 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 7e28525d53..41ce035b7e 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
index bfb36c16fc..38c5cc9ee1 100644
--- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c
+++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index e826f0aab1..799c35fd1e 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index 4f64d3a4f8..0e04f3c4e4 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 565e7d13c7..7ed1865a95 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 3ad959eb21..1525246bad 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
index ec4f40ef86..82d21af0ac 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
index b0efe60933..360b8b9673 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
index 9a74f2ad79..ded455665b 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
index 52c9586273..b5d25b6f63 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index ac3b061996..f390ec0b4c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 58999fc73a..3675a727d7 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c
index db87224018..0937b0ce17 100644
--- a/backends/cuda-ref/ceed-cuda-ref.c
+++ b/backends/cuda-ref/ceed-cuda-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 5010cdb885..337e7c92a0 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 6f83efaa1e..cae3bad181 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index e6cb677c4b..885e5f0979 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c
index cc6b82089d..1224032995 100644
--- a/backends/cuda-shared/ceed-cuda-shared.c
+++ b/backends/cuda-shared/ceed-cuda-shared.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index 6ef6770758..7d67327789 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-common.c b/backends/cuda/ceed-cuda-common.c
index 35fb6a262e..9538a2ee4d 100644
--- a/backends/cuda/ceed-cuda-common.c
+++ b/backends/cuda/ceed-cuda-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-common.h b/backends/cuda/ceed-cuda-common.h
index b7ec686424..489374a29a 100644
--- a/backends/cuda/ceed-cuda-common.h
+++ b/backends/cuda/ceed-cuda-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index 5348fa9398..d1593dd800 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h
index 8bda2c2f1a..151f0e0a24 100644
--- a/backends/cuda/ceed-cuda-compile.h
+++ b/backends/cuda/ceed-cuda-compile.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 3f20ab5070..2380cbb83d 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h
index 60c2fcd479..0bb7f20df3 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.h
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index bab3b81b79..c5918f914b 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c
index 33bfec8f46..872f312594 100644
--- a/backends/hip-gen/ceed-hip-gen-qfunction.c
+++ b/backends/hip-gen/ceed-hip-gen-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index 9dd958b321..8b3ead0db7 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index 06bbe5a1ad..1590f217f2 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index c0c198b329..a05bba5006 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 1d399b1b68..7f3eea4e35 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
index fe8a96cf38..bf938eacc4 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.h b/backends/hip-ref/ceed-hip-ref-qfunction-load.h
index 806874fdf1..5fc7073046 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.h
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c
index 95e2b90351..60dd757ee7 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
index 1f5eab0ea4..84a69716e6 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 56080e8676..b1cd8b5c06 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 77f63b2fe4..0bf497f27f 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c
index 2587e7fba3..f22f3a16e7 100644
--- a/backends/hip-ref/ceed-hip-ref.c
+++ b/backends/hip-ref/ceed-hip-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 24a95ec085..2e7ee88313 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index 0db492fc80..b9f81032b5 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index abbb86ab48..3fb4c93630 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c
index d8493e63ec..afb39e8bde 100644
--- a/backends/hip-shared/ceed-hip-shared.c
+++ b/backends/hip-shared/ceed-hip-shared.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index ea92ca1ee3..c534b85e33 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c
index 0b98004346..c33f13b766 100644
--- a/backends/hip/ceed-hip-common.c
+++ b/backends/hip/ceed-hip-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-common.h b/backends/hip/ceed-hip-common.h
index 23fdaacffe..fb89216be5 100644
--- a/backends/hip/ceed-hip-common.h
+++ b/backends/hip/ceed-hip-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index 0eeec57c78..e30bc07a02 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h
index d2cb987819..dd48fe4cd0 100644
--- a/backends/hip/ceed-hip-compile.h
+++ b/backends/hip/ceed-hip-compile.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 9a5fe0fc1e..6ce6ce33b9 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-common.c b/backends/magma/ceed-magma-common.c
index b3a03491c5..8e62e36b9c 100644
--- a/backends/magma/ceed-magma-common.c
+++ b/backends/magma/ceed-magma-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-common.h b/backends/magma/ceed-magma-common.h
index 1604aaef12..83c313390e 100644
--- a/backends/magma/ceed-magma-common.h
+++ b/backends/magma/ceed-magma-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-det.c b/backends/magma/ceed-magma-det.c
index 07b5bdd291..081cb6e7d9 100644
--- a/backends/magma/ceed-magma-det.c
+++ b/backends/magma/ceed-magma-det.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-nontensor.cpp b/backends/magma/ceed-magma-gemm-nontensor.cpp
index 6ac67ae227..c43ff9266a 100644
--- a/backends/magma/ceed-magma-gemm-nontensor.cpp
+++ b/backends/magma/ceed-magma-gemm-nontensor.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-nontensor.h b/backends/magma/ceed-magma-gemm-nontensor.h
index e867ad2600..f7108b07c4 100644
--- a/backends/magma/ceed-magma-gemm-nontensor.h
+++ b/backends/magma/ceed-magma-gemm-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-selector.cpp b/backends/magma/ceed-magma-gemm-selector.cpp
index 22532a8049..193c5ba4f5 100644
--- a/backends/magma/ceed-magma-gemm-selector.cpp
+++ b/backends/magma/ceed-magma-gemm-selector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-selector.h b/backends/magma/ceed-magma-gemm-selector.h
index c94eb81c20..c199ef7dc2 100644
--- a/backends/magma/ceed-magma-gemm-selector.h
+++ b/backends/magma/ceed-magma-gemm-selector.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c
index 8ffce54fe6..9908dd55da 100644
--- a/backends/magma/ceed-magma.c
+++ b/backends/magma/ceed-magma.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index 12eb9cab28..c800f2a6ab 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/Makefile b/backends/magma/tuning/Makefile
index 930213e647..bde10abd6e 100644
--- a/backends/magma/tuning/Makefile
+++ b/backends/magma/tuning/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/generate_tuning.py b/backends/magma/tuning/generate_tuning.py
index 04d563a99b..2e3180ba2f 100644
--- a/backends/magma/tuning/generate_tuning.py
+++ b/backends/magma/tuning/generate_tuning.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/tuning.cpp b/backends/magma/tuning/tuning.cpp
index 44e313fe69..37f20863ae 100644
--- a/backends/magma/tuning/tuning.cpp
+++ b/backends/magma/tuning/tuning.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-blocked.c b/backends/memcheck/ceed-memcheck-blocked.c
index 6d9ca0e0c9..009c9e4601 100644
--- a/backends/memcheck/ceed-memcheck-blocked.c
+++ b/backends/memcheck/ceed-memcheck-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c
index d67ea92a8f..17d823d4ab 100644
--- a/backends/memcheck/ceed-memcheck-qfunction.c
+++ b/backends/memcheck/ceed-memcheck-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-qfunctioncontext.c b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
index 0ba11cc464..01f67802c3 100644
--- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c
+++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index 1b57862613..e728d08d17 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-serial.c b/backends/memcheck/ceed-memcheck-serial.c
index 6a5b5a7b5e..a0140fbd75 100644
--- a/backends/memcheck/ceed-memcheck-serial.c
+++ b/backends/memcheck/ceed-memcheck-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index 1483ac9841..c5dd1fe56d 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h
index c09b533549..49f14e0270 100644
--- a/backends/memcheck/ceed-memcheck.h
+++ b/backends/memcheck/ceed-memcheck.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-basis.cpp b/backends/occa/ceed-occa-basis.cpp
index 29f9c39e41..53aa042e5f 100644
--- a/backends/occa/ceed-occa-basis.cpp
+++ b/backends/occa/ceed-occa-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-basis.hpp b/backends/occa/ceed-occa-basis.hpp
index 54d4bddb55..1e8c01ead4 100644
--- a/backends/occa/ceed-occa-basis.hpp
+++ b/backends/occa/ceed-occa-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-ceed-object.cpp b/backends/occa/ceed-occa-ceed-object.cpp
index 199d6e0119..7f82ae6cc0 100644
--- a/backends/occa/ceed-occa-ceed-object.cpp
+++ b/backends/occa/ceed-occa-ceed-object.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-ceed-object.hpp b/backends/occa/ceed-occa-ceed-object.hpp
index 326c5c2944..be90bf447e 100644
--- a/backends/occa/ceed-occa-ceed-object.hpp
+++ b/backends/occa/ceed-occa-ceed-object.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-context.cpp b/backends/occa/ceed-occa-context.cpp
index 2119a0eede..e22b221338 100644
--- a/backends/occa/ceed-occa-context.cpp
+++ b/backends/occa/ceed-occa-context.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-context.hpp b/backends/occa/ceed-occa-context.hpp
index 1785cc2024..4a41d79411 100644
--- a/backends/occa/ceed-occa-context.hpp
+++ b/backends/occa/ceed-occa-context.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-cpu-operator.cpp b/backends/occa/ceed-occa-cpu-operator.cpp
index 7f725c7d60..bdf6efea37 100644
--- a/backends/occa/ceed-occa-cpu-operator.cpp
+++ b/backends/occa/ceed-occa-cpu-operator.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-cpu-operator.hpp b/backends/occa/ceed-occa-cpu-operator.hpp
index 65496e274b..62c336562b 100644
--- a/backends/occa/ceed-occa-cpu-operator.hpp
+++ b/backends/occa/ceed-occa-cpu-operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-elem-restriction.cpp b/backends/occa/ceed-occa-elem-restriction.cpp
index 026f45d90c..2fa4b57c9e 100644
--- a/backends/occa/ceed-occa-elem-restriction.cpp
+++ b/backends/occa/ceed-occa-elem-restriction.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-elem-restriction.hpp b/backends/occa/ceed-occa-elem-restriction.hpp
index 3fdc226df7..6c6206b82c 100644
--- a/backends/occa/ceed-occa-elem-restriction.hpp
+++ b/backends/occa/ceed-occa-elem-restriction.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-gpu-operator.cpp b/backends/occa/ceed-occa-gpu-operator.cpp
index 14984056fc..f35a52bb97 100644
--- a/backends/occa/ceed-occa-gpu-operator.cpp
+++ b/backends/occa/ceed-occa-gpu-operator.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-gpu-operator.hpp b/backends/occa/ceed-occa-gpu-operator.hpp
index 8ebf6e742d..8b7651b396 100644
--- a/backends/occa/ceed-occa-gpu-operator.hpp
+++ b/backends/occa/ceed-occa-gpu-operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-kernels.hpp b/backends/occa/ceed-occa-kernels.hpp
index d5a4896e7b..bfa77f52a5 100644
--- a/backends/occa/ceed-occa-kernels.hpp
+++ b/backends/occa/ceed-occa-kernels.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-args.cpp b/backends/occa/ceed-occa-operator-args.cpp
index 0008c79ada..caf02788fd 100644
--- a/backends/occa/ceed-occa-operator-args.cpp
+++ b/backends/occa/ceed-occa-operator-args.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-args.hpp b/backends/occa/ceed-occa-operator-args.hpp
index 683225da80..6ea4d96687 100644
--- a/backends/occa/ceed-occa-operator-args.hpp
+++ b/backends/occa/ceed-occa-operator-args.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-field.cpp b/backends/occa/ceed-occa-operator-field.cpp
index 2906c53b0a..db13070149 100644
--- a/backends/occa/ceed-occa-operator-field.cpp
+++ b/backends/occa/ceed-occa-operator-field.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator-field.hpp b/backends/occa/ceed-occa-operator-field.hpp
index 8849fffeb9..866364fbb4 100644
--- a/backends/occa/ceed-occa-operator-field.hpp
+++ b/backends/occa/ceed-occa-operator-field.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator.cpp b/backends/occa/ceed-occa-operator.cpp
index 057a131162..9111c1f8a1 100644
--- a/backends/occa/ceed-occa-operator.cpp
+++ b/backends/occa/ceed-occa-operator.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-operator.hpp b/backends/occa/ceed-occa-operator.hpp
index d2a84ddb6b..866050ef1b 100644
--- a/backends/occa/ceed-occa-operator.hpp
+++ b/backends/occa/ceed-occa-operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-args.cpp b/backends/occa/ceed-occa-qfunction-args.cpp
index 974719b0a6..cec008f8e6 100644
--- a/backends/occa/ceed-occa-qfunction-args.cpp
+++ b/backends/occa/ceed-occa-qfunction-args.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-args.hpp b/backends/occa/ceed-occa-qfunction-args.hpp
index a8c5638a98..de0e1fd751 100644
--- a/backends/occa/ceed-occa-qfunction-args.hpp
+++ b/backends/occa/ceed-occa-qfunction-args.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-field.cpp b/backends/occa/ceed-occa-qfunction-field.cpp
index 1c15578544..c947f94458 100644
--- a/backends/occa/ceed-occa-qfunction-field.cpp
+++ b/backends/occa/ceed-occa-qfunction-field.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction-field.hpp b/backends/occa/ceed-occa-qfunction-field.hpp
index 7f4b34e158..00c91b1aac 100644
--- a/backends/occa/ceed-occa-qfunction-field.hpp
+++ b/backends/occa/ceed-occa-qfunction-field.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction.cpp b/backends/occa/ceed-occa-qfunction.cpp
index a72b04d00c..9b79aabb0a 100644
--- a/backends/occa/ceed-occa-qfunction.cpp
+++ b/backends/occa/ceed-occa-qfunction.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunction.hpp b/backends/occa/ceed-occa-qfunction.hpp
index e0eb84de84..e607941eb4 100644
--- a/backends/occa/ceed-occa-qfunction.hpp
+++ b/backends/occa/ceed-occa-qfunction.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunctioncontext.cpp b/backends/occa/ceed-occa-qfunctioncontext.cpp
index 6cdc9f36de..a570be638c 100644
--- a/backends/occa/ceed-occa-qfunctioncontext.cpp
+++ b/backends/occa/ceed-occa-qfunctioncontext.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-qfunctioncontext.hpp b/backends/occa/ceed-occa-qfunctioncontext.hpp
index 4cfde9a25c..b00857c2fb 100644
--- a/backends/occa/ceed-occa-qfunctioncontext.hpp
+++ b/backends/occa/ceed-occa-qfunctioncontext.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-simplex-basis.cpp b/backends/occa/ceed-occa-simplex-basis.cpp
index 5fff1d4e7e..000e68df0a 100644
--- a/backends/occa/ceed-occa-simplex-basis.cpp
+++ b/backends/occa/ceed-occa-simplex-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-simplex-basis.hpp b/backends/occa/ceed-occa-simplex-basis.hpp
index e4875f2ddb..a5de4701a9 100644
--- a/backends/occa/ceed-occa-simplex-basis.hpp
+++ b/backends/occa/ceed-occa-simplex-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-tensor-basis.cpp b/backends/occa/ceed-occa-tensor-basis.cpp
index 29c9361f09..9cca7e8318 100644
--- a/backends/occa/ceed-occa-tensor-basis.cpp
+++ b/backends/occa/ceed-occa-tensor-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-tensor-basis.hpp b/backends/occa/ceed-occa-tensor-basis.hpp
index 4d0dc2c2ba..88a6eb3029 100644
--- a/backends/occa/ceed-occa-tensor-basis.hpp
+++ b/backends/occa/ceed-occa-tensor-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-types.hpp b/backends/occa/ceed-occa-types.hpp
index 9dc1d83f58..52496e934c 100644
--- a/backends/occa/ceed-occa-types.hpp
+++ b/backends/occa/ceed-occa-types.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-vector.cpp b/backends/occa/ceed-occa-vector.cpp
index 9b369a4efe..efcabd15f9 100644
--- a/backends/occa/ceed-occa-vector.cpp
+++ b/backends/occa/ceed-occa-vector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa-vector.hpp b/backends/occa/ceed-occa-vector.hpp
index 71fcbdf693..7b2f8d730e 100644
--- a/backends/occa/ceed-occa-vector.hpp
+++ b/backends/occa/ceed-occa-vector.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa.cpp b/backends/occa/ceed-occa.cpp
index eca2f4e798..4cdbe5a290 100644
--- a/backends/occa/ceed-occa.cpp
+++ b/backends/occa/ceed-occa.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/ceed-occa.h b/backends/occa/ceed-occa.h
index 43df0a7001..76283f4dc9 100644
--- a/backends/occa/ceed-occa.h
+++ b/backends/occa/ceed-occa.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/elem-restriction.cpp b/backends/occa/kernels/elem-restriction.cpp
index 824e1ef79a..d252e1a670 100644
--- a/backends/occa/kernels/elem-restriction.cpp
+++ b/backends/occa/kernels/elem-restriction.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/elem-restriction.hpp b/backends/occa/kernels/elem-restriction.hpp
index 65d1309e03..c2989dfbc2 100644
--- a/backends/occa/kernels/elem-restriction.hpp
+++ b/backends/occa/kernels/elem-restriction.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/kernel-defines.hpp b/backends/occa/kernels/kernel-defines.hpp
index bbda69714b..8e66664b64 100644
--- a/backends/occa/kernels/kernel-defines.hpp
+++ b/backends/occa/kernels/kernel-defines.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/set-value.cpp b/backends/occa/kernels/set-value.cpp
index b165748d22..87efbc6163 100644
--- a/backends/occa/kernels/set-value.cpp
+++ b/backends/occa/kernels/set-value.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/set-value.hpp b/backends/occa/kernels/set-value.hpp
index 240b531d6c..c4173b2342 100644
--- a/backends/occa/kernels/set-value.hpp
+++ b/backends/occa/kernels/set-value.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/simplex-basis.hpp b/backends/occa/kernels/simplex-basis.hpp
index 40072e5758..b6e2d12cfe 100644
--- a/backends/occa/kernels/simplex-basis.hpp
+++ b/backends/occa/kernels/simplex-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
index 17976fa6d1..4b78dbf621 100644
--- a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
+++ b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
index 2ef0dfe28a..ab0f2d8fd8 100644
--- a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
+++ b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis.hpp b/backends/occa/kernels/tensor-basis.hpp
index fe763ace59..afffd661e8 100644
--- a/backends/occa/kernels/tensor-basis.hpp
+++ b/backends/occa/kernels/tensor-basis.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
index 98619f64e8..5af734984f 100644
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
+++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
index 8649e42070..143025f7ba 100644
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
+++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
index eda0e8475d..45263bb635 100644
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
+++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
index 892447c271..408140b723 100644
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
+++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
index c98df69a7f..c080336f40 100644
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
+++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
index 3251951305..6d0b5f631c 100644
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
+++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-blocked.c b/backends/opt/ceed-opt-blocked.c
index a3ba49c7fa..e8980c3ba9 100644
--- a/backends/opt/ceed-opt-blocked.c
+++ b/backends/opt/ceed-opt-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 03f6c28fd0..3a67a97607 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-serial.c b/backends/opt/ceed-opt-serial.c
index 86bc832bc8..1e3517b44a 100644
--- a/backends/opt/ceed-opt-serial.c
+++ b/backends/opt/ceed-opt-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt-tensor.c b/backends/opt/ceed-opt-tensor.c
index 24a00adb81..ee41dce029 100644
--- a/backends/opt/ceed-opt-tensor.c
+++ b/backends/opt/ceed-opt-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h
index 96bfff26bc..a1b67a58e4 100644
--- a/backends/opt/ceed-opt.h
+++ b/backends/opt/ceed-opt.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index c4d07f69f4..21cbe2201f 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index 5bad14bb6f..326ff93e61 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-qfunction.c b/backends/ref/ceed-ref-qfunction.c
index a0e6e32cce..caedcbbad1 100644
--- a/backends/ref/ceed-ref-qfunction.c
+++ b/backends/ref/ceed-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-qfunctioncontext.c b/backends/ref/ceed-ref-qfunctioncontext.c
index 1e8ed0cc90..6c3e500560 100644
--- a/backends/ref/ceed-ref-qfunctioncontext.c
+++ b/backends/ref/ceed-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 2d0f3895cc..de65e5854b 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-tensor.c b/backends/ref/ceed-ref-tensor.c
index 38c7880dc6..9d66a2a68d 100644
--- a/backends/ref/ceed-ref-tensor.c
+++ b/backends/ref/ceed-ref-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref-vector.c b/backends/ref/ceed-ref-vector.c
index 813ce21d6d..520afdd61a 100644
--- a/backends/ref/ceed-ref-vector.c
+++ b/backends/ref/ceed-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref.c b/backends/ref/ceed-ref.c
index 274e3c72c9..46af219839 100644
--- a/backends/ref/ceed-ref.c
+++ b/backends/ref/ceed-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index 8396fc1a69..4af06564a5 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
index 0a00fdb3b9..b112488569 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index aa7d5f0253..ec783e5cc2 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index c1414c15c0..9370c98d5a 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
index 3529e76015..99d1438269 100644
--- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp
index cc632651f6..5ba1836197 100644
--- a/backends/sycl-gen/ceed-sycl-gen.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index e07fe476d6..1d67da7e0a 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
index a90ad9e78c..508830fffd 100644
--- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 74b455fbdb..45cef53918 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
index 712a8b66b4..7a68343f29 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
index e2e2b63749..82cac87b6d 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
index 0c783e713b..fb0ad6f287 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
index 0629d404be..b6a50a0226 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp
index adce713bb6..723fcc8cfb 100644
--- a/backends/sycl-ref/ceed-sycl-ref.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
index 6b2996a460..ffa5a78d7d 100644
--- a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
index ba7e520dfc..d33d135198 100644
--- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
index 1c7d3f8b5e..9b1e8f1033 100644
--- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/kernels/sycl-ref-vector.cpp b/backends/sycl-ref/kernels/sycl-ref-vector.cpp
index bc1c26593c..11db777dce 100644
--- a/backends/sycl-ref/kernels/sycl-ref-vector.cpp
+++ b/backends/sycl-ref/kernels/sycl-ref-vector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
index bac5693aa4..162b2acb3c 100644
--- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-shared/ceed-sycl-shared.hpp b/backends/sycl-shared/ceed-sycl-shared.hpp
index 69dd86fe9b..2e2c3df1ca 100644
--- a/backends/sycl-shared/ceed-sycl-shared.hpp
+++ b/backends/sycl-shared/ceed-sycl-shared.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
index 7f001a65cd..a563a73626 100644
--- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-common.hpp b/backends/sycl/ceed-sycl-common.hpp
index 3a84e1ef33..f087f8c29a 100644
--- a/backends/sycl/ceed-sycl-common.hpp
+++ b/backends/sycl/ceed-sycl-common.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp
index 253d00d077..aa09b693df 100644
--- a/backends/sycl/ceed-sycl-common.sycl.cpp
+++ b/backends/sycl/ceed-sycl-common.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp
index 9faea1f6dc..1baa1f3ca4 100644
--- a/backends/sycl/ceed-sycl-compile.hpp
+++ b/backends/sycl/ceed-sycl-compile.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp
index 39b7aa80b7..f939ca940f 100644
--- a/backends/sycl/ceed-sycl-compile.sycl.cpp
+++ b/backends/sycl/ceed-sycl-compile.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm-blocked.c b/backends/xsmm/ceed-xsmm-blocked.c
index ec6fb2376f..2abaa247c1 100644
--- a/backends/xsmm/ceed-xsmm-blocked.c
+++ b/backends/xsmm/ceed-xsmm-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm-serial.c b/backends/xsmm/ceed-xsmm-serial.c
index 0869ff3442..7892e845be 100644
--- a/backends/xsmm/ceed-xsmm-serial.c
+++ b/backends/xsmm/ceed-xsmm-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c
index 8386181de4..21bf22ef8b 100644
--- a/backends/xsmm/ceed-xsmm-tensor.c
+++ b/backends/xsmm/ceed-xsmm-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/xsmm/ceed-xsmm.h b/backends/xsmm/ceed-xsmm.h
index 65ff339d8a..124d8d4493 100644
--- a/backends/xsmm/ceed-xsmm.h
+++ b/backends/xsmm/ceed-xsmm.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/benchmark.sh b/benchmarks/benchmark.sh
index 9cf27bd199..59ff3cc0d7 100755
--- a/benchmarks/benchmark.sh
+++ b/benchmarks/benchmark.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/petsc-bps.sh b/benchmarks/petsc-bps.sh
index 8e69a10c86..004dc4b5b8 100755
--- a/benchmarks/petsc-bps.sh
+++ b/benchmarks/petsc-bps.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/petsc-bpsraw.sh b/benchmarks/petsc-bpsraw.sh
index 20567e4408..7099bb4ce1 100755
--- a/benchmarks/petsc-bpsraw.sh
+++ b/benchmarks/petsc-bpsraw.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_base.py b/benchmarks/postprocess_base.py
index b63acb4b65..f69d283d38 100755
--- a/benchmarks/postprocess_base.py
+++ b/benchmarks/postprocess_base.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_plot.py b/benchmarks/postprocess_plot.py
index 52b7c5ee5b..59101837f3 100755
--- a/benchmarks/postprocess_plot.py
+++ b/benchmarks/postprocess_plot.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_table.py b/benchmarks/postprocess_table.py
index 22463e4cf7..8822a346ff 100755
--- a/benchmarks/postprocess_table.py
+++ b/benchmarks/postprocess_table.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/common.mk b/common.mk
index 19ba90d847..1a53bbf820 100644
--- a/common.mk
+++ b/common.mk
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/Makefile b/examples/ceed/Makefile
index a5dbc70b4a..db88064a1e 100644
--- a/examples/ceed/Makefile
+++ b/examples/ceed/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c
index 1ac84ea112..354f977113 100644
--- a/examples/ceed/ex1-volume.c
+++ b/examples/ceed/ex1-volume.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h
index 79cf3d0d4e..72361e620c 100644
--- a/examples/ceed/ex1-volume.h
+++ b/examples/ceed/ex1-volume.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c
index 2db90319fd..2191e4dc63 100644
--- a/examples/ceed/ex2-surface.c
+++ b/examples/ceed/ex2-surface.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h
index 901be2e9ce..5e8e253003 100644
--- a/examples/ceed/ex2-surface.h
+++ b/examples/ceed/ex2-surface.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex3-volume.c b/examples/ceed/ex3-volume.c
index 13f5275e07..380882a631 100644
--- a/examples/ceed/ex3-volume.c
+++ b/examples/ceed/ex3-volume.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/ceed/ex3-volume.h b/examples/ceed/ex3-volume.h
index 11e9c7f99c..956648a211 100644
--- a/examples/ceed/ex3-volume.h
+++ b/examples/ceed/ex3-volume.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/bc_definition.h b/examples/fluids/include/bc_definition.h
index c43f68f5cd..7b5671ab1c 100644
--- a/examples/fluids/include/bc_definition.h
+++ b/examples/fluids/include/bc_definition.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/log_events.h b/examples/fluids/include/log_events.h
index 8e65ac373c..4a70db3b83 100644
--- a/examples/fluids/include/log_events.h
+++ b/examples/fluids/include/log_events.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h
index 882b6221bc..67a77b7591 100644
--- a/examples/fluids/include/mat-ceed-impl.h
+++ b/examples/fluids/include/mat-ceed-impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h
index 11e7caad0d..b6a8c08511 100644
--- a/examples/fluids/include/mat-ceed.h
+++ b/examples/fluids/include/mat-ceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/petsc-ceed-utils.h b/examples/fluids/include/petsc-ceed-utils.h
index 33c9aa2412..54b61610cb 100644
--- a/examples/fluids/include/petsc-ceed-utils.h
+++ b/examples/fluids/include/petsc-ceed-utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/petsc-ceed.h b/examples/fluids/include/petsc-ceed.h
index a8667b3b75..3b3d648d15 100644
--- a/examples/fluids/include/petsc-ceed.h
+++ b/examples/fluids/include/petsc-ceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/petsc_ops.h b/examples/fluids/include/petsc_ops.h
index ab79c0ad76..d614df60ab 100644
--- a/examples/fluids/include/petsc_ops.h
+++ b/examples/fluids/include/petsc_ops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 122f1f579d..0b674bd660 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index a5c2b9fc30..26ba140814 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 307aac9081..79275a231c 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c
index d93eabf094..b2f23f786e 100644
--- a/examples/fluids/problems/bc_freestream.c
+++ b/examples/fluids/problems/bc_freestream.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/bc_slip.c b/examples/fluids/problems/bc_slip.c
index 2bb762e8d1..727188dfe0 100644
--- a/examples/fluids/problems/bc_slip.c
+++ b/examples/fluids/problems/bc_slip.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index f01d67a42c..b9f3654046 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/channel.c b/examples/fluids/problems/channel.c
index 4b95e9acab..55734e042d 100644
--- a/examples/fluids/problems/channel.c
+++ b/examples/fluids/problems/channel.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/densitycurrent.c b/examples/fluids/problems/densitycurrent.c
index 7075701ecb..1dbbe36fb0 100644
--- a/examples/fluids/problems/densitycurrent.c
+++ b/examples/fluids/problems/densitycurrent.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c
index 96b2f5d293..34d74052ab 100644
--- a/examples/fluids/problems/eulervortex.c
+++ b/examples/fluids/problems/eulervortex.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/gaussianwave.c b/examples/fluids/problems/gaussianwave.c
index b421296473..abadc453f7 100644
--- a/examples/fluids/problems/gaussianwave.c
+++ b/examples/fluids/problems/gaussianwave.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index e13df731c2..1ab6e222ac 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/shocktube.c b/examples/fluids/problems/shocktube.c
index 32c8dd691a..462f9cebb8 100644
--- a/examples/fluids/problems/shocktube.c
+++ b/examples/fluids/problems/shocktube.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c
index ca5eb8c9f1..ca8dd2d10f 100644
--- a/examples/fluids/problems/stg_shur14.c
+++ b/examples/fluids/problems/stg_shur14.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/stg_shur14.h b/examples/fluids/problems/stg_shur14.h
index 0af335b26d..49fd6f1f6b 100644
--- a/examples/fluids/problems/stg_shur14.h
+++ b/examples/fluids/problems/stg_shur14.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/taylorgreen.c b/examples/fluids/problems/taylorgreen.c
index a6974775fe..7a0e55f52c 100644
--- a/examples/fluids/problems/taylorgreen.c
+++ b/examples/fluids/problems/taylorgreen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/advection.h b/examples/fluids/qfunctions/advection.h
index e7cace4afd..486e0727ed 100644
--- a/examples/fluids/qfunctions/advection.h
+++ b/examples/fluids/qfunctions/advection.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/advection_types.h b/examples/fluids/qfunctions/advection_types.h
index 90c709d54a..ed008f0603 100644
--- a/examples/fluids/qfunctions/advection_types.h
+++ b/examples/fluids/qfunctions/advection_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/bc_freestream.h b/examples/fluids/qfunctions/bc_freestream.h
index 7767b3af1f..c348e9ab2e 100644
--- a/examples/fluids/qfunctions/bc_freestream.h
+++ b/examples/fluids/qfunctions/bc_freestream.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/bc_freestream_type.h b/examples/fluids/qfunctions/bc_freestream_type.h
index a6c7456842..62a3fa1c4c 100644
--- a/examples/fluids/qfunctions/bc_freestream_type.h
+++ b/examples/fluids/qfunctions/bc_freestream_type.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/bc_slip.h b/examples/fluids/qfunctions/bc_slip.h
index 2d5bd21cac..5a77f3727e 100644
--- a/examples/fluids/qfunctions/bc_slip.h
+++ b/examples/fluids/qfunctions/bc_slip.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index 88c24661de..20e4f4c72b 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/channel.h b/examples/fluids/qfunctions/channel.h
index d19fafec03..9595c81701 100644
--- a/examples/fluids/qfunctions/channel.h
+++ b/examples/fluids/qfunctions/channel.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/densitycurrent.h b/examples/fluids/qfunctions/densitycurrent.h
index b328cce698..69fe9488fd 100644
--- a/examples/fluids/qfunctions/densitycurrent.h
+++ b/examples/fluids/qfunctions/densitycurrent.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/differential_filter.h b/examples/fluids/qfunctions/differential_filter.h
index e8dc47e619..a983cd7a63 100644
--- a/examples/fluids/qfunctions/differential_filter.h
+++ b/examples/fluids/qfunctions/differential_filter.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/differential_filter_enums.h b/examples/fluids/qfunctions/differential_filter_enums.h
index 9e00c67ccf..9c000c3b9d 100644
--- a/examples/fluids/qfunctions/differential_filter_enums.h
+++ b/examples/fluids/qfunctions/differential_filter_enums.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/eulervortex.h b/examples/fluids/qfunctions/eulervortex.h
index 8862005cc2..2f6c0ad003 100644
--- a/examples/fluids/qfunctions/eulervortex.h
+++ b/examples/fluids/qfunctions/eulervortex.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/gaussianwave.h b/examples/fluids/qfunctions/gaussianwave.h
index aa90258248..0bf6b612b4 100644
--- a/examples/fluids/qfunctions/gaussianwave.h
+++ b/examples/fluids/qfunctions/gaussianwave.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/grid_anisotropy_tensor.h b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
index 7311c3090d..80078afcd4 100644
--- a/examples/fluids/qfunctions/grid_anisotropy_tensor.h
+++ b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/inverse_multiplicity.h b/examples/fluids/qfunctions/inverse_multiplicity.h
index 07191c5fc2..2c4a5ef335 100644
--- a/examples/fluids/qfunctions/inverse_multiplicity.h
+++ b/examples/fluids/qfunctions/inverse_multiplicity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/mass.h b/examples/fluids/qfunctions/mass.h
index a05857c53e..81de13d16e 100644
--- a/examples/fluids/qfunctions/mass.h
+++ b/examples/fluids/qfunctions/mass.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h
index a273a25aea..fa470e5cd1 100644
--- a/examples/fluids/qfunctions/newtonian.h
+++ b/examples/fluids/qfunctions/newtonian.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h
index b0b054831d..0b6796f2fc 100644
--- a/examples/fluids/qfunctions/newtonian_state.h
+++ b/examples/fluids/qfunctions/newtonian_state.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/newtonian_types.h b/examples/fluids/qfunctions/newtonian_types.h
index 15cf679a2e..70b2b4c3bd 100644
--- a/examples/fluids/qfunctions/newtonian_types.h
+++ b/examples/fluids/qfunctions/newtonian_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/riemann_solver.h b/examples/fluids/qfunctions/riemann_solver.h
index a3a2970da1..7d884e9ad1 100644
--- a/examples/fluids/qfunctions/riemann_solver.h
+++ b/examples/fluids/qfunctions/riemann_solver.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/setupgeo.h b/examples/fluids/qfunctions/setupgeo.h
index b05bbd8373..62b8390376 100644
--- a/examples/fluids/qfunctions/setupgeo.h
+++ b/examples/fluids/qfunctions/setupgeo.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/setupgeo2d.h b/examples/fluids/qfunctions/setupgeo2d.h
index 4564a3fc27..0cd5649296 100644
--- a/examples/fluids/qfunctions/setupgeo2d.h
+++ b/examples/fluids/qfunctions/setupgeo2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/setupgeo_helpers.h b/examples/fluids/qfunctions/setupgeo_helpers.h
index 870295fd74..6677225f4f 100644
--- a/examples/fluids/qfunctions/setupgeo_helpers.h
+++ b/examples/fluids/qfunctions/setupgeo_helpers.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/shocktube.h b/examples/fluids/qfunctions/shocktube.h
index c4a5ead1c1..64e0798a44 100644
--- a/examples/fluids/qfunctions/shocktube.h
+++ b/examples/fluids/qfunctions/shocktube.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stabilization.h b/examples/fluids/qfunctions/stabilization.h
index 9c4718498b..87f05823aa 100644
--- a/examples/fluids/qfunctions/stabilization.h
+++ b/examples/fluids/qfunctions/stabilization.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stabilization_types.h b/examples/fluids/qfunctions/stabilization_types.h
index c9241ecc4e..8544e428e9 100644
--- a/examples/fluids/qfunctions/stabilization_types.h
+++ b/examples/fluids/qfunctions/stabilization_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index b62bbda326..2e8b05db1c 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stg_shur14_type.h b/examples/fluids/qfunctions/stg_shur14_type.h
index 4ade5d20c2..945956de84 100644
--- a/examples/fluids/qfunctions/stg_shur14_type.h
+++ b/examples/fluids/qfunctions/stg_shur14_type.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/strong_boundary_conditions.h b/examples/fluids/qfunctions/strong_boundary_conditions.h
index 8177fe33d3..1526580963 100644
--- a/examples/fluids/qfunctions/strong_boundary_conditions.h
+++ b/examples/fluids/qfunctions/strong_boundary_conditions.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/taylorgreen.h b/examples/fluids/qfunctions/taylorgreen.h
index 7bc6074990..c28e718913 100644
--- a/examples/fluids/qfunctions/taylorgreen.h
+++ b/examples/fluids/qfunctions/taylorgreen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/turb_spanstats.h b/examples/fluids/qfunctions/turb_spanstats.h
index 49adf1f364..6331b119e9 100644
--- a/examples/fluids/qfunctions/turb_spanstats.h
+++ b/examples/fluids/qfunctions/turb_spanstats.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/turb_stats_types.h b/examples/fluids/qfunctions/turb_stats_types.h
index 12deac8c20..dccae3653a 100644
--- a/examples/fluids/qfunctions/turb_stats_types.h
+++ b/examples/fluids/qfunctions/turb_stats_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/utils.h b/examples/fluids/qfunctions/utils.h
index 61dcc13d3a..bd9d787efc 100644
--- a/examples/fluids/qfunctions/utils.h
+++ b/examples/fluids/qfunctions/utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
index d4c1ffc482..71587633dd 100644
--- a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
+++ b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/velocity_gradient_projection.h b/examples/fluids/qfunctions/velocity_gradient_projection.h
index 3b143d6c33..2fecc3f258 100644
--- a/examples/fluids/qfunctions/velocity_gradient_projection.h
+++ b/examples/fluids/qfunctions/velocity_gradient_projection.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/bc_definition.c b/examples/fluids/src/bc_definition.c
index 7b917ae642..acdb50a370 100644
--- a/examples/fluids/src/bc_definition.c
+++ b/examples/fluids/src/bc_definition.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/boundary_condition.c b/examples/fluids/src/boundary_condition.c
index 3f3f7f7a06..89e917634d 100644
--- a/examples/fluids/src/boundary_condition.c
+++ b/examples/fluids/src/boundary_condition.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c
index aa9d70ccb9..905144216c 100644
--- a/examples/fluids/src/cloptions.c
+++ b/examples/fluids/src/cloptions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index 130dafa876..c3f1478867 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/dm_utils.c b/examples/fluids/src/dm_utils.c
index 55f99f058e..b7a1bf8ea7 100644
--- a/examples/fluids/src/dm_utils.c
+++ b/examples/fluids/src/dm_utils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/grid_anisotropy_tensor.c b/examples/fluids/src/grid_anisotropy_tensor.c
index 7f78edd3ca..8e5ffecb49 100644
--- a/examples/fluids/src/grid_anisotropy_tensor.c
+++ b/examples/fluids/src/grid_anisotropy_tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/inverse_multiplicity.c b/examples/fluids/src/inverse_multiplicity.c
index 5d3b7dd9ae..0e8cb90cbf 100644
--- a/examples/fluids/src/inverse_multiplicity.c
+++ b/examples/fluids/src/inverse_multiplicity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/log_events.c b/examples/fluids/src/log_events.c
index fce0368614..d67b312250 100644
--- a/examples/fluids/src/log_events.c
+++ b/examples/fluids/src/log_events.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index 5b74939ee0..cebed9689b 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index 982979f99e..786b081b2e 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/qdata.c b/examples/fluids/src/qdata.c
index d39f7f1fc2..4288220a6b 100644
--- a/examples/fluids/src/qdata.c
+++ b/examples/fluids/src/qdata.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/setupdm.c b/examples/fluids/src/setupdm.c
index b4de50feb6..25573ca63d 100644
--- a/examples/fluids/src/setupdm.c
+++ b/examples/fluids/src/setupdm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 026b496878..bb801aa269 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 1a4e3f3e76..6ebaa66b39 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/strong_boundary_conditions.c b/examples/fluids/src/strong_boundary_conditions.c
index 69a8558f78..76bee17d39 100644
--- a/examples/fluids/src/strong_boundary_conditions.c
+++ b/examples/fluids/src/strong_boundary_conditions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index f5f3c26854..942efc38a7 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/velocity_gradient_projection.c b/examples/fluids/src/velocity_gradient_projection.c
index d022047900..277da68ee1 100644
--- a/examples/fluids/src/velocity_gradient_projection.c
+++ b/examples/fluids/src/velocity_gradient_projection.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/Makefile b/examples/mfem/Makefile
index d94e4b01f1..6b042926fe 100644
--- a/examples/mfem/Makefile
+++ b/examples/mfem/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp1.cpp b/examples/mfem/bp1.cpp
index 22f420cc24..096a6aeee7 100644
--- a/examples/mfem/bp1.cpp
+++ b/examples/mfem/bp1.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp1.h b/examples/mfem/bp1.h
index 47c4879707..3e6fe273c8 100644
--- a/examples/mfem/bp1.h
+++ b/examples/mfem/bp1.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp1.hpp b/examples/mfem/bp1.hpp
index 7a3afbdcff..912346857c 100644
--- a/examples/mfem/bp1.hpp
+++ b/examples/mfem/bp1.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp3.cpp b/examples/mfem/bp3.cpp
index ccc435fdb9..779a75f3a2 100644
--- a/examples/mfem/bp3.cpp
+++ b/examples/mfem/bp3.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp3.h b/examples/mfem/bp3.h
index 1c2d2bfb41..bc73b3acab 100644
--- a/examples/mfem/bp3.h
+++ b/examples/mfem/bp3.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp3.hpp b/examples/mfem/bp3.hpp
index f6cea2fc59..36b88b3697 100644
--- a/examples/mfem/bp3.hpp
+++ b/examples/mfem/bp3.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/nek/bps/bps.h b/examples/nek/bps/bps.h
index 5e54d12e90..5de48e9e34 100644
--- a/examples/nek/bps/bps.h
+++ b/examples/nek/bps/bps.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/nek/bps/bps.usr b/examples/nek/bps/bps.usr
index 24ccc9cd00..f5021d78d9 100644
--- a/examples/nek/bps/bps.usr
+++ b/examples/nek/bps/bps.usr
@@ -1,4 +1,4 @@
-C Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+C Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 C All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details.
 C
 C SPDX-License-Identifier: (BSD-2-Clause)
diff --git a/examples/petsc/Makefile b/examples/petsc/Makefile
index ee3bd9313d..d66fd3176a 100644
--- a/examples/petsc/Makefile
+++ b/examples/petsc/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/area.c b/examples/petsc/area.c
index 9821caccf3..2c7e2fbe7a 100644
--- a/examples/petsc/area.c
+++ b/examples/petsc/area.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/area.h b/examples/petsc/area.h
index 08c69f43b4..fd36dd79df 100644
--- a/examples/petsc/area.h
+++ b/examples/petsc/area.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index c5f02e3f6d..92838c8537 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bps.h b/examples/petsc/bps.h
index e45e9dcb17..95d4a4c644 100644
--- a/examples/petsc/bps.h
+++ b/examples/petsc/bps.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index 4970494304..2541d351bc 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c
index df1e7bbdc0..acd591d5e3 100644
--- a/examples/petsc/bpssphere.c
+++ b/examples/petsc/bpssphere.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpssphere.h b/examples/petsc/bpssphere.h
index 80e64144a0..c5d030bab8 100644
--- a/examples/petsc/bpssphere.h
+++ b/examples/petsc/bpssphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 9fc1320b21..787148797a 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/dmswarm.c b/examples/petsc/dmswarm.c
index 903f601cc4..fa95f16979 100644
--- a/examples/petsc/dmswarm.c
+++ b/examples/petsc/dmswarm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/areaproblemdata.h b/examples/petsc/include/areaproblemdata.h
index 290dc86bdb..5820409159 100644
--- a/examples/petsc/include/areaproblemdata.h
+++ b/examples/petsc/include/areaproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/bpsproblemdata.h b/examples/petsc/include/bpsproblemdata.h
index 5e3a55d8f5..9525216d0f 100644
--- a/examples/petsc/include/bpsproblemdata.h
+++ b/examples/petsc/include/bpsproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/libceedsetup.h b/examples/petsc/include/libceedsetup.h
index dee9e9d730..c87130e923 100644
--- a/examples/petsc/include/libceedsetup.h
+++ b/examples/petsc/include/libceedsetup.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/matops.h b/examples/petsc/include/matops.h
index 5d2162229c..d9e03b6f6d 100644
--- a/examples/petsc/include/matops.h
+++ b/examples/petsc/include/matops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/petscutils.h b/examples/petsc/include/petscutils.h
index 8b9d10d075..0f1f5d0ad6 100644
--- a/examples/petsc/include/petscutils.h
+++ b/examples/petsc/include/petscutils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/petscversion.h b/examples/petsc/include/petscversion.h
index 50d333c33e..426aeae2ba 100644
--- a/examples/petsc/include/petscversion.h
+++ b/examples/petsc/include/petscversion.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/sphereproblemdata.h b/examples/petsc/include/sphereproblemdata.h
index a4a2b7f8b4..4a63deea05 100644
--- a/examples/petsc/include/sphereproblemdata.h
+++ b/examples/petsc/include/sphereproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/structs.h b/examples/petsc/include/structs.h
index 8b6c48e296..8b2647fe16 100644
--- a/examples/petsc/include/structs.h
+++ b/examples/petsc/include/structs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/swarmutils.h b/examples/petsc/include/swarmutils.h
index 14b16c48b7..4beed9bef1 100644
--- a/examples/petsc/include/swarmutils.h
+++ b/examples/petsc/include/swarmutils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index a59011074b..39bf1eb1ab 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/area/areacube.h b/examples/petsc/qfunctions/area/areacube.h
index e041bb3ad2..f008846f2a 100644
--- a/examples/petsc/qfunctions/area/areacube.h
+++ b/examples/petsc/qfunctions/area/areacube.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/area/areasphere.h b/examples/petsc/qfunctions/area/areasphere.h
index 902e2e17ae..13e5536e14 100644
--- a/examples/petsc/qfunctions/area/areasphere.h
+++ b/examples/petsc/qfunctions/area/areasphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp1.h b/examples/petsc/qfunctions/bps/bp1.h
index 060ec3f395..fb35d0249e 100644
--- a/examples/petsc/qfunctions/bps/bp1.h
+++ b/examples/petsc/qfunctions/bps/bp1.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp13.h b/examples/petsc/qfunctions/bps/bp13.h
index a721b21f5b..33d454546d 100644
--- a/examples/petsc/qfunctions/bps/bp13.h
+++ b/examples/petsc/qfunctions/bps/bp13.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp1sphere.h b/examples/petsc/qfunctions/bps/bp1sphere.h
index ba5162be73..394d3d6cae 100644
--- a/examples/petsc/qfunctions/bps/bp1sphere.h
+++ b/examples/petsc/qfunctions/bps/bp1sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp2.h b/examples/petsc/qfunctions/bps/bp2.h
index c0660f76af..21da3ec39a 100644
--- a/examples/petsc/qfunctions/bps/bp2.h
+++ b/examples/petsc/qfunctions/bps/bp2.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp24.h b/examples/petsc/qfunctions/bps/bp24.h
index dad569f852..4870cd1cfe 100644
--- a/examples/petsc/qfunctions/bps/bp24.h
+++ b/examples/petsc/qfunctions/bps/bp24.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp2sphere.h b/examples/petsc/qfunctions/bps/bp2sphere.h
index 2370699150..aa08525c86 100644
--- a/examples/petsc/qfunctions/bps/bp2sphere.h
+++ b/examples/petsc/qfunctions/bps/bp2sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp3.h b/examples/petsc/qfunctions/bps/bp3.h
index aeac6005d9..153ad6e021 100644
--- a/examples/petsc/qfunctions/bps/bp3.h
+++ b/examples/petsc/qfunctions/bps/bp3.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp3sphere.h b/examples/petsc/qfunctions/bps/bp3sphere.h
index db5064d38a..911e14d0ac 100644
--- a/examples/petsc/qfunctions/bps/bp3sphere.h
+++ b/examples/petsc/qfunctions/bps/bp3sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp4.h b/examples/petsc/qfunctions/bps/bp4.h
index 89d8ce98e9..0ccad57d68 100644
--- a/examples/petsc/qfunctions/bps/bp4.h
+++ b/examples/petsc/qfunctions/bps/bp4.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/bp4sphere.h b/examples/petsc/qfunctions/bps/bp4sphere.h
index cd26f72767..43b4806afe 100644
--- a/examples/petsc/qfunctions/bps/bp4sphere.h
+++ b/examples/petsc/qfunctions/bps/bp4sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/bps/common.h b/examples/petsc/qfunctions/bps/common.h
index 29e5c5709a..09cccd5840 100644
--- a/examples/petsc/qfunctions/bps/common.h
+++ b/examples/petsc/qfunctions/bps/common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/qfunctions/swarm/swarmmass.h b/examples/petsc/qfunctions/swarm/swarmmass.h
index 7eefea5806..1b6fa1e21c 100644
--- a/examples/petsc/qfunctions/swarm/swarmmass.h
+++ b/examples/petsc/qfunctions/swarm/swarmmass.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/src/libceedsetup.c b/examples/petsc/src/libceedsetup.c
index 446cf93f72..8f8323e7a6 100644
--- a/examples/petsc/src/libceedsetup.c
+++ b/examples/petsc/src/libceedsetup.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/src/petscutils.c b/examples/petsc/src/petscutils.c
index 990a75f1fd..1c4076ed10 100644
--- a/examples/petsc/src/petscutils.c
+++ b/examples/petsc/src/petscutils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/src/swarmutils.c b/examples/petsc/src/swarmutils.c
index f7c37ac0de..21339ae9d6 100644
--- a/examples/petsc/src/swarmutils.c
+++ b/examples/petsc/src/swarmutils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/Makefile b/examples/python/Makefile
index 2e22bc0f2d..64244ea2c1 100644
--- a/examples/python/Makefile
+++ b/examples/python/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/conftest.py b/examples/python/conftest.py
index 6c763ac90a..70bdf69cfc 100644
--- a/examples/python/conftest.py
+++ b/examples/python/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/ex1_volume.py b/examples/python/ex1_volume.py
index 91b6b8d41d..b08b7e34e3 100644
--- a/examples/python/ex1_volume.py
+++ b/examples/python/ex1_volume.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/ex2_surface.py b/examples/python/ex2_surface.py
index 97be1ef276..f741600110 100644
--- a/examples/python/ex2_surface.py
+++ b/examples/python/ex2_surface.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/ex3_volume.py b/examples/python/ex3_volume.py
index f803adaec8..7fe6df7387 100644
--- a/examples/python/ex3_volume.py
+++ b/examples/python/ex3_volume.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/ex_common.py b/examples/python/ex_common.py
index db68b8f567..00e75805fb 100644
--- a/examples/python/ex_common.py
+++ b/examples/python/ex_common.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/ex_test.py b/examples/python/ex_test.py
index e814007500..4d9cbf1e6a 100644
--- a/examples/python/ex_test.py
+++ b/examples/python/ex_test.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/qfunctions/ex-common.h b/examples/python/qfunctions/ex-common.h
index b6e98500b2..32b867b67f 100644
--- a/examples/python/qfunctions/ex-common.h
+++ b/examples/python/qfunctions/ex-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/qfunctions/ex1-volume.h b/examples/python/qfunctions/ex1-volume.h
index 984339cb86..907b77bf5c 100644
--- a/examples/python/qfunctions/ex1-volume.h
+++ b/examples/python/qfunctions/ex1-volume.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/qfunctions/ex2-surface.h b/examples/python/qfunctions/ex2-surface.h
index 52f0aea1ae..980a952105 100644
--- a/examples/python/qfunctions/ex2-surface.h
+++ b/examples/python/qfunctions/ex2-surface.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/qfunctions/ex3-volume.h b/examples/python/qfunctions/ex3-volume.h
index 76489a622a..1a992480cc 100644
--- a/examples/python/qfunctions/ex3-volume.h
+++ b/examples/python/qfunctions/ex3-volume.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/python/qfunctions/qfunctions.c b/examples/python/qfunctions/qfunctions.c
index f7fd7f945d..ee41a501a7 100644
--- a/examples/python/qfunctions/qfunctions.c
+++ b/examples/python/qfunctions/qfunctions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust-qfunctions/Makefile b/examples/rust-qfunctions/Makefile
index 92f0915cbf..2fba76706a 100644
--- a/examples/rust-qfunctions/Makefile
+++ b/examples/rust-qfunctions/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust-qfunctions/ex1-volume.c b/examples/rust-qfunctions/ex1-volume.c
index 75b0a836f1..906ab1aff7 100644
--- a/examples/rust-qfunctions/ex1-volume.c
+++ b/examples/rust-qfunctions/ex1-volume.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust-qfunctions/ex1-volume.h b/examples/rust-qfunctions/ex1-volume.h
index e769823dfc..1c2baa8fc2 100644
--- a/examples/rust-qfunctions/ex1-volume.h
+++ b/examples/rust-qfunctions/ex1-volume.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume-vector/src/main.rs b/examples/rust/ex1-volume-vector/src/main.rs
index b319d0b6f2..85921e688d 100644
--- a/examples/rust/ex1-volume-vector/src/main.rs
+++ b/examples/rust/ex1-volume-vector/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume-vector/src/opt.rs b/examples/rust/ex1-volume-vector/src/opt.rs
index e74dba8bf6..37cbf0a3c0 100644
--- a/examples/rust/ex1-volume-vector/src/opt.rs
+++ b/examples/rust/ex1-volume-vector/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume-vector/src/transform.rs b/examples/rust/ex1-volume-vector/src/transform.rs
index e18f0e4948..7073937353 100644
--- a/examples/rust/ex1-volume-vector/src/transform.rs
+++ b/examples/rust/ex1-volume-vector/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs
index dac51d4278..9020fb270c 100644
--- a/examples/rust/ex1-volume/src/main.rs
+++ b/examples/rust/ex1-volume/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume/src/opt.rs b/examples/rust/ex1-volume/src/opt.rs
index 66d5ca74b8..c93cd17180 100644
--- a/examples/rust/ex1-volume/src/opt.rs
+++ b/examples/rust/ex1-volume/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex1-volume/src/transform.rs b/examples/rust/ex1-volume/src/transform.rs
index e18f0e4948..7073937353 100644
--- a/examples/rust/ex1-volume/src/transform.rs
+++ b/examples/rust/ex1-volume/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface-vector/src/main.rs b/examples/rust/ex2-surface-vector/src/main.rs
index 9314572c62..e2ff598d2e 100644
--- a/examples/rust/ex2-surface-vector/src/main.rs
+++ b/examples/rust/ex2-surface-vector/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface-vector/src/opt.rs b/examples/rust/ex2-surface-vector/src/opt.rs
index 1ad9d895a4..ecbeb8c3cc 100644
--- a/examples/rust/ex2-surface-vector/src/opt.rs
+++ b/examples/rust/ex2-surface-vector/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface-vector/src/transform.rs b/examples/rust/ex2-surface-vector/src/transform.rs
index 5a15323e28..43cdbfe0f1 100644
--- a/examples/rust/ex2-surface-vector/src/transform.rs
+++ b/examples/rust/ex2-surface-vector/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index b4f892d30e..ee66c4663d 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface/src/opt.rs b/examples/rust/ex2-surface/src/opt.rs
index 176d2bab80..f2c1afc8f2 100644
--- a/examples/rust/ex2-surface/src/opt.rs
+++ b/examples/rust/ex2-surface/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex2-surface/src/transform.rs b/examples/rust/ex2-surface/src/transform.rs
index 5a15323e28..43cdbfe0f1 100644
--- a/examples/rust/ex2-surface/src/transform.rs
+++ b/examples/rust/ex2-surface/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-volume-vector/src/main.rs b/examples/rust/ex3-volume-vector/src/main.rs
index 0d572871b3..2eb530470e 100644
--- a/examples/rust/ex3-volume-vector/src/main.rs
+++ b/examples/rust/ex3-volume-vector/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-volume-vector/src/opt.rs b/examples/rust/ex3-volume-vector/src/opt.rs
index e21db82caa..edf546b032 100644
--- a/examples/rust/ex3-volume-vector/src/opt.rs
+++ b/examples/rust/ex3-volume-vector/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-volume-vector/src/transform.rs b/examples/rust/ex3-volume-vector/src/transform.rs
index 01054d2ecd..e022a34860 100644
--- a/examples/rust/ex3-volume-vector/src/transform.rs
+++ b/examples/rust/ex3-volume-vector/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-volume/src/main.rs b/examples/rust/ex3-volume/src/main.rs
index 7f7b691be0..16c3dfcfa3 100644
--- a/examples/rust/ex3-volume/src/main.rs
+++ b/examples/rust/ex3-volume/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-volume/src/opt.rs b/examples/rust/ex3-volume/src/opt.rs
index e21db82caa..edf546b032 100644
--- a/examples/rust/ex3-volume/src/opt.rs
+++ b/examples/rust/ex3-volume/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/ex3-volume/src/transform.rs b/examples/rust/ex3-volume/src/transform.rs
index 01054d2ecd..e022a34860 100644
--- a/examples/rust/ex3-volume/src/transform.rs
+++ b/examples/rust/ex3-volume/src/transform.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs
index 5623cdaaaf..775d8e5ec2 100644
--- a/examples/rust/mesh/src/lib.rs
+++ b/examples/rust/mesh/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/Makefile b/examples/solids/Makefile
index 5c87c8f0d0..490b229acc 100644
--- a/examples/solids/Makefile
+++ b/examples/solids/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/elasticity.c b/examples/solids/elasticity.c
index 06e56f6d3d..6c7db8fe97 100644
--- a/examples/solids/elasticity.c
+++ b/examples/solids/elasticity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/elasticity.h b/examples/solids/elasticity.h
index 155a022136..9458668b30 100644
--- a/examples/solids/elasticity.h
+++ b/examples/solids/elasticity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/boundary.h b/examples/solids/include/boundary.h
index c58c00e888..ca5916b682 100644
--- a/examples/solids/include/boundary.h
+++ b/examples/solids/include/boundary.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/cl-options.h b/examples/solids/include/cl-options.h
index a6db168fad..1d4b8fc962 100644
--- a/examples/solids/include/cl-options.h
+++ b/examples/solids/include/cl-options.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/matops.h b/examples/solids/include/matops.h
index aead345453..9b1fe843ba 100644
--- a/examples/solids/include/matops.h
+++ b/examples/solids/include/matops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/misc.h b/examples/solids/include/misc.h
index 03738cb4d1..d6dc668b3f 100644
--- a/examples/solids/include/misc.h
+++ b/examples/solids/include/misc.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/setup-dm.h b/examples/solids/include/setup-dm.h
index ff07f7e86f..06c5347c18 100644
--- a/examples/solids/include/setup-dm.h
+++ b/examples/solids/include/setup-dm.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/setup-libceed.h b/examples/solids/include/setup-libceed.h
index fc1606bf68..870f3bdf16 100644
--- a/examples/solids/include/setup-libceed.h
+++ b/examples/solids/include/setup-libceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/structs.h b/examples/solids/include/structs.h
index 75f9e3c5ce..f553002f93 100644
--- a/examples/solids/include/structs.h
+++ b/examples/solids/include/structs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/utils.h b/examples/solids/include/utils.h
index 82832a3755..709be45d3d 100644
--- a/examples/solids/include/utils.h
+++ b/examples/solids/include/utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/cl-problems.h b/examples/solids/problems/cl-problems.h
index ec9a10b2ff..f596a01b60 100644
--- a/examples/solids/problems/cl-problems.h
+++ b/examples/solids/problems/cl-problems.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/finite-strain-mooney-rivlin.c b/examples/solids/problems/finite-strain-mooney-rivlin.c
index bae739e175..9798eeb26e 100644
--- a/examples/solids/problems/finite-strain-mooney-rivlin.c
+++ b/examples/solids/problems/finite-strain-mooney-rivlin.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/finite-strain-neo-hookean.c b/examples/solids/problems/finite-strain-neo-hookean.c
index d7ae867a6f..3948d257e3 100644
--- a/examples/solids/problems/finite-strain-neo-hookean.c
+++ b/examples/solids/problems/finite-strain-neo-hookean.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/linear.c b/examples/solids/problems/linear.c
index a733acc70a..82cb9635d1 100644
--- a/examples/solids/problems/linear.c
+++ b/examples/solids/problems/linear.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/mooney-rivlin.c b/examples/solids/problems/mooney-rivlin.c
index 2f6de2337f..4444250187 100644
--- a/examples/solids/problems/mooney-rivlin.c
+++ b/examples/solids/problems/mooney-rivlin.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/mooney-rivlin.h b/examples/solids/problems/mooney-rivlin.h
index eca930aea0..0903df99d6 100644
--- a/examples/solids/problems/mooney-rivlin.h
+++ b/examples/solids/problems/mooney-rivlin.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/neo-hookean.c b/examples/solids/problems/neo-hookean.c
index 86d2b9b0f0..dfd2d68005 100644
--- a/examples/solids/problems/neo-hookean.c
+++ b/examples/solids/problems/neo-hookean.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/neo-hookean.h b/examples/solids/problems/neo-hookean.h
index eeeb5e3b42..a80e508001 100644
--- a/examples/solids/problems/neo-hookean.h
+++ b/examples/solids/problems/neo-hookean.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/problems.c b/examples/solids/problems/problems.c
index e86686ce61..5a819aecf1 100644
--- a/examples/solids/problems/problems.c
+++ b/examples/solids/problems/problems.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/problems.h b/examples/solids/problems/problems.h
index d7856b703c..e71ab1719b 100644
--- a/examples/solids/problems/problems.h
+++ b/examples/solids/problems/problems.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/common.h b/examples/solids/qfunctions/common.h
index 3bd1d37f0f..cf63c02a93 100644
--- a/examples/solids/qfunctions/common.h
+++ b/examples/solids/qfunctions/common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/constant-force.h b/examples/solids/qfunctions/constant-force.h
index 147686156d..e37505c7e4 100644
--- a/examples/solids/qfunctions/constant-force.h
+++ b/examples/solids/qfunctions/constant-force.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/finite-strain-mooney-rivlin.h b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
index 74975966fe..9fc34b9ff5 100644
--- a/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
+++ b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean.h b/examples/solids/qfunctions/finite-strain-neo-hookean.h
index ea7486677a..5742c5e8ff 100644
--- a/examples/solids/qfunctions/finite-strain-neo-hookean.h
+++ b/examples/solids/qfunctions/finite-strain-neo-hookean.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/linear.h b/examples/solids/qfunctions/linear.h
index 6e300af27c..b6f9573c05 100644
--- a/examples/solids/qfunctions/linear.h
+++ b/examples/solids/qfunctions/linear.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/manufactured-force.h b/examples/solids/qfunctions/manufactured-force.h
index 9c063409bb..41b761351f 100644
--- a/examples/solids/qfunctions/manufactured-force.h
+++ b/examples/solids/qfunctions/manufactured-force.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/manufactured-true.h b/examples/solids/qfunctions/manufactured-true.h
index 943bca1686..25cffbd126 100644
--- a/examples/solids/qfunctions/manufactured-true.h
+++ b/examples/solids/qfunctions/manufactured-true.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/qfunctions/traction-boundary.h b/examples/solids/qfunctions/traction-boundary.h
index 797cb7cdcd..6cc3c2e16d 100644
--- a/examples/solids/qfunctions/traction-boundary.h
+++ b/examples/solids/qfunctions/traction-boundary.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/boundary.c b/examples/solids/src/boundary.c
index 9f63128e3f..2fdaac80ea 100644
--- a/examples/solids/src/boundary.c
+++ b/examples/solids/src/boundary.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/cl-options.c b/examples/solids/src/cl-options.c
index 4e6087990b..3dc3d7effb 100644
--- a/examples/solids/src/cl-options.c
+++ b/examples/solids/src/cl-options.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/matops.c b/examples/solids/src/matops.c
index def1109c00..31930d2446 100644
--- a/examples/solids/src/matops.c
+++ b/examples/solids/src/matops.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/misc.c b/examples/solids/src/misc.c
index 1633b628f5..6c45e893b6 100644
--- a/examples/solids/src/misc.c
+++ b/examples/solids/src/misc.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/setup-dm.c b/examples/solids/src/setup-dm.c
index 38fdd3889a..9a4d55a356 100644
--- a/examples/solids/src/setup-dm.c
+++ b/examples/solids/src/setup-dm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/setup-libceed.c b/examples/solids/src/setup-libceed.c
index 16adff9f84..bfe153fcbf 100644
--- a/examples/solids/src/setup-libceed.c
+++ b/examples/solids/src/setup-libceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/ceed-gallery-list.h b/gallery/ceed-gallery-list.h
index 9014adad07..d45aea46cc 100644
--- a/gallery/ceed-gallery-list.h
+++ b/gallery/ceed-gallery-list.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/ceed-gallery-weak.c b/gallery/ceed-gallery-weak.c
index 02065b51fc..8744f75d8f 100644
--- a/gallery/ceed-gallery-weak.c
+++ b/gallery/ceed-gallery-weak.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/identity/ceed-identity.c b/gallery/identity/ceed-identity.c
index 07bdb7d3c0..1391986b58 100644
--- a/gallery/identity/ceed-identity.c
+++ b/gallery/identity/ceed-identity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass-vector/ceed-vectormassapply.c b/gallery/mass-vector/ceed-vectormassapply.c
index 6899f1bfb8..388f203802 100644
--- a/gallery/mass-vector/ceed-vectormassapply.c
+++ b/gallery/mass-vector/ceed-vectormassapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-mass1dbuild.c b/gallery/mass/ceed-mass1dbuild.c
index 798e1924d8..13f6ce0f77 100644
--- a/gallery/mass/ceed-mass1dbuild.c
+++ b/gallery/mass/ceed-mass1dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-mass2dbuild.c b/gallery/mass/ceed-mass2dbuild.c
index 766ec2f999..b4431443a2 100644
--- a/gallery/mass/ceed-mass2dbuild.c
+++ b/gallery/mass/ceed-mass2dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-mass3dbuild.c b/gallery/mass/ceed-mass3dbuild.c
index fcc428d1c6..58ffc2fc9e 100644
--- a/gallery/mass/ceed-mass3dbuild.c
+++ b/gallery/mass/ceed-mass3dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/mass/ceed-massapply.c b/gallery/mass/ceed-massapply.c
index 232f137d1c..cefa208c0f 100644
--- a/gallery/mass/ceed-massapply.c
+++ b/gallery/mass/ceed-massapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
index a2c5aa4b5c..433eca7f2e 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
index 4170ea7e56..4efd1225f6 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
index 47bfbfac50..d35e0ae3d9 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson1dapply.c b/gallery/poisson/ceed-poisson1dapply.c
index 02edfa39d9..2112d24ce2 100644
--- a/gallery/poisson/ceed-poisson1dapply.c
+++ b/gallery/poisson/ceed-poisson1dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson1dbuild.c b/gallery/poisson/ceed-poisson1dbuild.c
index 98eab20581..c54aa72f55 100644
--- a/gallery/poisson/ceed-poisson1dbuild.c
+++ b/gallery/poisson/ceed-poisson1dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson2dapply.c b/gallery/poisson/ceed-poisson2dapply.c
index d8dea0c3cd..1c90c84c5c 100644
--- a/gallery/poisson/ceed-poisson2dapply.c
+++ b/gallery/poisson/ceed-poisson2dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson2dbuild.c b/gallery/poisson/ceed-poisson2dbuild.c
index 0772c7961c..7310aab0d7 100644
--- a/gallery/poisson/ceed-poisson2dbuild.c
+++ b/gallery/poisson/ceed-poisson2dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson3dapply.c b/gallery/poisson/ceed-poisson3dapply.c
index d5742ed0dc..a5e0207b15 100644
--- a/gallery/poisson/ceed-poisson3dapply.c
+++ b/gallery/poisson/ceed-poisson3dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/poisson/ceed-poisson3dbuild.c b/gallery/poisson/ceed-poisson3dbuild.c
index 63004755de..8054f3d4ad 100644
--- a/gallery/poisson/ceed-poisson3dbuild.c
+++ b/gallery/poisson/ceed-poisson3dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/gallery/scale/ceed-scale.c b/gallery/scale/ceed-scale.c
index 77aff92063..6f86879e2c 100644
--- a/gallery/scale/ceed-scale.c
+++ b/gallery/scale/ceed-scale.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed-fortran-name.h b/include/ceed-fortran-name.h
index 5f70b6c879..1646f3deeb 100644
--- a/include/ceed-fortran-name.h
+++ b/include/ceed-fortran-name.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 6fd4ed523a..2c2454abcf 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 627d2907c2..62d4307560 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h
index 0bce734257..ede7251d81 100644
--- a/include/ceed/ceed-f32.h
+++ b/include/ceed/ceed-f32.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h
index b74d867c18..88e37972f9 100644
--- a/include/ceed/ceed-f64.h
+++ b/include/ceed/ceed-f64.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 49d7983732..f18b6c391c 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/cuda.h b/include/ceed/cuda.h
index 4a53a5239e..eb9ac3e9cb 100644
--- a/include/ceed/cuda.h
+++ b/include/ceed/cuda.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/deprecated.h b/include/ceed/deprecated.h
index ee6cc394f5..233b910a60 100644
--- a/include/ceed/deprecated.h
+++ b/include/ceed/deprecated.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/fortran.h b/include/ceed/fortran.h
index 08bb627e57..ed0c0ef628 100644
--- a/include/ceed/fortran.h
+++ b/include/ceed/fortran.h
@@ -1,4 +1,4 @@
-! Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+! Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 !
 ! SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/hip.h b/include/ceed/hip.h
index 5f4bdd149a..86ba7dc098 100644
--- a/include/ceed/hip.h
+++ b/include/ceed/hip.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
index 7cef2f83a2..351c3be86c 100644
--- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
+++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index f4dccf54ea..5fd998d9e9 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h
index e592b31689..d9cd5a8963 100644
--- a/include/ceed/jit-source/cuda/cuda-jit.h
+++ b/include/ceed/jit-source/cuda/cuda-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
index 4316f70c7e..2a4967f807 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
index b6e90450a8..c441e414ef 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index c461d0ce30..602a6d1f40 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index a39a9fede7..baa8554eda 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
index 54fe5ec1ad..5be93d9a1e 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 569c2728be..76643040fb 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
index 3fb97139dc..61785cc00c 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
index 48c9f13063..73ecc3bb25 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
index f8ad690489..e83eebb8cd 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
index 264524e728..487c4d2194 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
index e8217e4b9d..ead457562a 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
index a72250d311..c5dc12b227 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
index e5b31970ff..d49bc52a4b 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index ec7102ea2c..abddaa58cd 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index cb62c4f80b..ececd93ae6 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index 0da9163716..6f2843acce 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index dcb1763e38..fc812792e4 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
index fcc084c687..54594b3af4 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index af2bdebcbd..dc05f100ae 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 53a50bfd16..ae1cdfc5c7 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h
index 9acb0064a3..58b2961246 100644
--- a/include/ceed/jit-source/cuda/cuda-types.h
+++ b/include/ceed/jit-source/cuda/cuda-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h
index 0c99c75507..a502fce59d 100644
--- a/include/ceed/jit-source/gallery/ceed-identity.h
+++ b/include/ceed/jit-source/gallery/ceed-identity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
index d37985aba7..dfa9c4ae7e 100644
--- a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
index 02e11a30e1..a23c14858a 100644
--- a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
index 692bb2e917..fdff95017e 100644
--- a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-massapply.h b/include/ceed/jit-source/gallery/ceed-massapply.h
index 70559b3429..bfd2c99491 100644
--- a/include/ceed/jit-source/gallery/ceed-massapply.h
+++ b/include/ceed/jit-source/gallery/ceed-massapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
index 33709c7ee2..6400c5d1b2 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
index eb619b3056..94b366d997 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
index 808b1eb988..3e94ac700f 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
index afa4c6d64a..23f00a26dd 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
index 4b0894b9dd..4b3f687494 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
index 6a3a818cad..71ea0d1a69 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-scale.h b/include/ceed/jit-source/gallery/ceed-scale.h
index d7a7098b3f..12fcfe5277 100644
--- a/include/ceed/jit-source/gallery/ceed-scale.h
+++ b/include/ceed/jit-source/gallery/ceed-scale.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectormassapply.h b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
index c07ebc7fc8..072ccc4cd0 100644
--- a/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
index ef8552bf32..928a5e9882 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
index aef318b9c5..16f98bed6b 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
index 52fb3565c1..60e296ac81 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 03f9204a99..0064ec66e3 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h
index 70a00416e4..032d716828 100644
--- a/include/ceed/jit-source/hip/hip-jit.h
+++ b/include/ceed/jit-source/hip/hip-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
index 546cef8780..c6b951b87a 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
index f0707ee270..923318aa86 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 302ea9fff6..61ef0d3f0a 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index 2642a22ae8..1455b5ac21 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
index c73a0b8063..581545f71a 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index d9bcb07cd9..a235c8be7a 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-qfunction.h b/include/ceed/jit-source/hip/hip-ref-qfunction.h
index f26ec054b1..bf605feba4 100644
--- a/include/ceed/jit-source/hip/hip-ref-qfunction.h
+++ b/include/ceed/jit-source/hip/hip-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
index 15fad9984a..3c88d685a3 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
index cc23fa3a52..ee5544309d 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
index 80d746503e..a3e952b5ca 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
index a88dac2295..ffe8890ef2 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
index f3443b33d4..445aede42d 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
index 9b04bdbb8c..71d183bcf8 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
index 36c34c4e91..175e720a55 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index 349ea253b0..80be446bee 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
index 1b1e6675f1..d93ce6c90b 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 5b009d525e..f30e6070c4 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
index c13aab77f6..bba3c2f8a1 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index 72c1f6731d..ada945ed1e 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index f1fb0bbe6b..3b82d1190c 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-types.h b/include/ceed/jit-source/hip/hip-types.h
index 7befa1b492..ebe689c094 100644
--- a/include/ceed/jit-source/hip/hip-types.h
+++ b/include/ceed/jit-source/hip/hip-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
index ba7c645262..ed2aceb69a 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
index 671da961e8..9fda73c657 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
index b833ad7609..4b835216f2 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
index fb1fa90b33..531b9273e2 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
index 956059fea9..04640fe75b 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
index 6011db7e97..004071ee32 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
index f3dfa150ee..15f2b90ce6 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
index ad8e6290d2..d922a7586c 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
index 9ef709af0e..9cbb18baae 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
index bcfa89df40..8fc3e96919 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
index 5233afa6c9..51cf97d727 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h
index a2ea52628a..22a1b835cb 100644
--- a/include/ceed/jit-source/magma/magma-common-defs.h
+++ b/include/ceed/jit-source/magma/magma-common-defs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h
index 70a73247b1..8f33484295 100644
--- a/include/ceed/jit-source/magma/magma-common-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-common-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h
index 87ec727c0d..d0ca2f53c0 100644
--- a/include/ceed/jit-source/magma/magma-common-tensor.h
+++ b/include/ceed/jit-source/magma/magma-common-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h
index b028924996..5dada5b9eb 100644
--- a/include/ceed/jit-source/sycl/sycl-gen-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h
index 1a2971f4df..25837f5701 100644
--- a/include/ceed/jit-source/sycl/sycl-jit.h
+++ b/include/ceed/jit-source/sycl/sycl-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
index b22d86ec33..9f5df69e68 100644
--- a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
+++ b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
index 1105df17f2..551789e48b 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
index 8d37c1ba82..f023b77d6b 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
index 3593f8ab7d..71f60cce8b 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h
index 3d57991fa4..5133c6eee8 100644
--- a/include/ceed/jit-source/sycl/sycl-types.h
+++ b/include/ceed/jit-source/sycl/sycl-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-tools.h b/include/ceed/jit-tools.h
index 1213f974d2..c82a9ad075 100644
--- a/include/ceed/jit-tools.h
+++ b/include/ceed/jit-tools.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/types.h b/include/ceed/types.h
index 5492816012..c687c218f9 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index d93dc273b6..30b090b855 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-config.c b/interface/ceed-config.c
index 3c837167b7..49d428d926 100644
--- a/interface/ceed-config.c
+++ b/interface/ceed-config.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-cuda.c b/interface/ceed-cuda.c
index b54c706720..ea15d46735 100644
--- a/interface/ceed-cuda.c
+++ b/interface/ceed-cuda.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 7408bbce2e..795477a4bf 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c
index 402f581979..042d7ae014 100644
--- a/interface/ceed-fortran.c
+++ b/interface/ceed-fortran.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-hip.c b/interface/ceed-hip.c
index 911a374a94..f35480e873 100644
--- a/interface/ceed-hip.c
+++ b/interface/ceed-hip.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-source-root-default.c b/interface/ceed-jit-source-root-default.c
index 27587c6fd9..2cee49718f 100644
--- a/interface/ceed-jit-source-root-default.c
+++ b/interface/ceed-jit-source-root-default.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-source-root-install.c b/interface/ceed-jit-source-root-install.c
index 9b41385694..b80dca4f9f 100644
--- a/interface/ceed-jit-source-root-install.c
+++ b/interface/ceed-jit-source-root-install.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index fdc0efb217..c50e683f9a 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 466d8d0cd9..d72333c294 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 94237510db..aaed880485 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-qfunction-register.c b/interface/ceed-qfunction-register.c
index 05163b6a3d..eb3832c4f5 100644
--- a/interface/ceed-qfunction-register.c
+++ b/interface/ceed-qfunction-register.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 956348892f..b15c2ceaaf 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index 8919726495..93a7fcecab 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-register.c b/interface/ceed-register.c
index 08d2189cc2..759a6463fb 100644
--- a/interface/ceed-register.c
+++ b/interface/ceed-register.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c
index 2ae2b4446f..ac2f789f72 100644
--- a/interface/ceed-tensor.c
+++ b/interface/ceed-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-types.c b/interface/ceed-types.c
index e975793307..cbec562cff 100644
--- a/interface/ceed-types.c
+++ b/interface/ceed-types.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index b73cdfbf5a..64e4e26227 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed.c b/interface/ceed.c
index 133ddabfd3..f53d92c4e4 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/__init__.py b/python/__init__.py
index 9c77ff8833..c5eb31d18e 100644
--- a/python/__init__.py
+++ b/python/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/build_ceed_cffi.py b/python/build_ceed_cffi.py
index 5ecba579af..71c99a21f4 100644
--- a/python/build_ceed_cffi.py
+++ b/python/build_ceed_cffi.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed.py b/python/ceed.py
index c146f562f5..8df025acae 100644
--- a/python/ceed.py
+++ b/python/ceed.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_basis.py b/python/ceed_basis.py
index 2b41dc542a..c4f71a7089 100644
--- a/python/ceed_basis.py
+++ b/python/ceed_basis.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_constants.py b/python/ceed_constants.py
index b0df95fdaf..8b4ea22673 100644
--- a/python/ceed_constants.py
+++ b/python/ceed_constants.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_elemrestriction.py b/python/ceed_elemrestriction.py
index b71df55685..42e72a9311 100644
--- a/python/ceed_elemrestriction.py
+++ b/python/ceed_elemrestriction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_operator.py b/python/ceed_operator.py
index e3053439f0..90c3549f36 100644
--- a/python/ceed_operator.py
+++ b/python/ceed_operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_qfunction.py b/python/ceed_qfunction.py
index a0c462efed..9c73581ae4 100644
--- a/python/ceed_qfunction.py
+++ b/python/ceed_qfunction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_qfunctioncontext.py b/python/ceed_qfunctioncontext.py
index 712adcc090..b98863aa7d 100644
--- a/python/ceed_qfunctioncontext.py
+++ b/python/ceed_qfunctioncontext.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_vector.py b/python/ceed_vector.py
index 379d1e1913..06bd693ec6 100644
--- a/python/ceed_vector.py
+++ b/python/ceed_vector.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/Makefile b/python/tests/Makefile
index b13dd58922..94c49d5b3a 100644
--- a/python/tests/Makefile
+++ b/python/tests/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/conftest.py b/python/tests/conftest.py
index 6c763ac90a..70bdf69cfc 100644
--- a/python/tests/conftest.py
+++ b/python/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/libceed-qfunctions.c b/python/tests/libceed-qfunctions.c
index 8feb69aa91..14fdfa6749 100644
--- a/python/tests/libceed-qfunctions.c
+++ b/python/tests/libceed-qfunctions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/setup-qfunctions.py b/python/tests/setup-qfunctions.py
index 3a697113a4..74074b67c3 100644
--- a/python/tests/setup-qfunctions.py
+++ b/python/tests/setup-qfunctions.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/setup.cfg b/python/tests/setup.cfg
index 89e8bd3596..7290d8e331 100644
--- a/python/tests/setup.cfg
+++ b/python/tests/setup.cfg
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-0-ceed.py b/python/tests/test-0-ceed.py
index 76c2d14332..5ab30e1fd9 100644
--- a/python/tests/test-0-ceed.py
+++ b/python/tests/test-0-ceed.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-1-vector.py b/python/tests/test-1-vector.py
index 834212c72c..9838a35b30 100644
--- a/python/tests/test-1-vector.py
+++ b/python/tests/test-1-vector.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-2-elemrestriction.py b/python/tests/test-2-elemrestriction.py
index f85beef4ab..60feb73626 100644
--- a/python/tests/test-2-elemrestriction.py
+++ b/python/tests/test-2-elemrestriction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-3-basis.py b/python/tests/test-3-basis.py
index df62c24bdc..aaded78b21 100644
--- a/python/tests/test-3-basis.py
+++ b/python/tests/test-3-basis.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-4-qfunction.py b/python/tests/test-4-qfunction.py
index 7fd4ea41b1..0491a2c624 100644
--- a/python/tests/test-4-qfunction.py
+++ b/python/tests/test-4-qfunction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-5-operator.py b/python/tests/test-5-operator.py
index 39219230c5..1b67bdab2d 100644
--- a/python/tests/test-5-operator.py
+++ b/python/tests/test-5-operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-qfunctions.h b/python/tests/test-qfunctions.h
index f4b5aa30d8..5790d540aa 100644
--- a/python/tests/test-qfunctions.h
+++ b/python/tests/test-qfunctions.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed-sys/src/lib.rs b/rust/libceed-sys/src/lib.rs
index 419ed405bb..21dff0d343 100644
--- a/rust/libceed-sys/src/lib.rs
+++ b/rust/libceed-sys/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index 72c3cb8bde..7018e0d462 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index e6dcd74145..d251220ff7 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index 0821989a7e..ae6487ee83 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index e82e92a393..85cb3a0d18 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index dac75c8f9f..09912c93be 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index 827aac8436..42c1c211f0 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed/tests/version-numbers.rs b/rust/libceed/tests/version-numbers.rs
index 5060c78398..c0f189e2e1 100644
--- a/rust/libceed/tests/version-numbers.rs
+++ b/rust/libceed/tests/version-numbers.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 // All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details.
 //
 // SPDX-License-Identifier: (BSD-2-Clause)
diff --git a/tests/t319-basis.h b/tests/t319-basis.h
index 12e95b6aa1..965a7fcd0c 100644
--- a/tests/t319-basis.h
+++ b/tests/t319-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t320-basis-f.h b/tests/t320-basis-f.h
index 762c754b64..84e7486a10 100644
--- a/tests/t320-basis-f.h
+++ b/tests/t320-basis-f.h
@@ -1,4 +1,4 @@
-! Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+! Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 !
 ! SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t320-basis.h b/tests/t320-basis.h
index 942103a5e8..30f8e824d0 100644
--- a/tests/t320-basis.h
+++ b/tests/t320-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t330-basis.h b/tests/t330-basis.h
index 9fdbe5e531..b75bd421b9 100644
--- a/tests/t330-basis.h
+++ b/tests/t330-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t340-basis.h b/tests/t340-basis.h
index 9768c2623d..90aef60f15 100644
--- a/tests/t340-basis.h
+++ b/tests/t340-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t400-qfunction.h b/tests/t400-qfunction.h
index d3207139f6..740c7da030 100644
--- a/tests/t400-qfunction.h
+++ b/tests/t400-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t401-qfunction.h b/tests/t401-qfunction.h
index 856fa98110..f91dae701c 100644
--- a/tests/t401-qfunction.h
+++ b/tests/t401-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t405-qfunction.h b/tests/t405-qfunction.h
index 4e2d211c3d..0c356c943e 100644
--- a/tests/t405-qfunction.h
+++ b/tests/t405-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t406-qfunction-helper.h b/tests/t406-qfunction-helper.h
index c000ee3d73..0410b00af6 100644
--- a/tests/t406-qfunction-helper.h
+++ b/tests/t406-qfunction-helper.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t406-qfunction-scales.h b/tests/t406-qfunction-scales.h
index 90685238a2..7dc42e93c6 100644
--- a/tests/t406-qfunction-scales.h
+++ b/tests/t406-qfunction-scales.h
@@ -3,7 +3,7 @@
 // Testing # on first line
 // Note: #ifndef and #pragma once header guards both work
 
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h
index e886a61b4c..617e6b7875 100644
--- a/tests/t406-qfunction.h
+++ b/tests/t406-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t409-qfunction.h b/tests/t409-qfunction.h
index 78e9930e27..b2f59a9f80 100644
--- a/tests/t409-qfunction.h
+++ b/tests/t409-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t500-operator.h b/tests/t500-operator.h
index 5efd4bac27..935d077208 100644
--- a/tests/t500-operator.h
+++ b/tests/t500-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t502-operator.h b/tests/t502-operator.h
index 5f7a9da561..fab809d8db 100644
--- a/tests/t502-operator.h
+++ b/tests/t502-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t507-operator.h b/tests/t507-operator.h
index adbb802ac2..312500b35f 100644
--- a/tests/t507-operator.h
+++ b/tests/t507-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t510-operator.h b/tests/t510-operator.h
index cfe155e2bc..171f9d01df 100644
--- a/tests/t510-operator.h
+++ b/tests/t510-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t522-operator.h b/tests/t522-operator.h
index 0685068099..b594818bd1 100644
--- a/tests/t522-operator.h
+++ b/tests/t522-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t530-operator.h b/tests/t530-operator.h
index cfe155e2bc..171f9d01df 100644
--- a/tests/t530-operator.h
+++ b/tests/t530-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t531-operator.h b/tests/t531-operator.h
index d310e303ae..f1c3ccab25 100644
--- a/tests/t531-operator.h
+++ b/tests/t531-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t532-operator.h b/tests/t532-operator.h
index a8e6755a7d..b81f87dbc6 100644
--- a/tests/t532-operator.h
+++ b/tests/t532-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t534-operator.h b/tests/t534-operator.h
index 83556af4f8..518481a070 100644
--- a/tests/t534-operator.h
+++ b/tests/t534-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t535-operator.h b/tests/t535-operator.h
index 9510f5ae25..fc62a6ca0d 100644
--- a/tests/t535-operator.h
+++ b/tests/t535-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t537-operator.h b/tests/t537-operator.h
index 71d1988e79..f08c690d12 100644
--- a/tests/t537-operator.h
+++ b/tests/t537-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t539-operator.h b/tests/t539-operator.h
index a48cc7c13b..65eaa85554 100644
--- a/tests/t539-operator.h
+++ b/tests/t539-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t540-operator.h b/tests/t540-operator.h
index 6278964c57..0259af529c 100644
--- a/tests/t540-operator.h
+++ b/tests/t540-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t541-operator.h b/tests/t541-operator.h
index 2ccc6a6b1d..a8a3424f78 100644
--- a/tests/t541-operator.h
+++ b/tests/t541-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t566-operator.h b/tests/t566-operator.h
index 3f5a7d90a6..c227b7d834 100644
--- a/tests/t566-operator.h
+++ b/tests/t566-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t567-operator.h b/tests/t567-operator.h
index 5ab2e633a3..997b6db1bb 100644
--- a/tests/t567-operator.h
+++ b/tests/t567-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t568-operator.h b/tests/t568-operator.h
index 6047197254..6c38bb04c7 100644
--- a/tests/t568-operator.h
+++ b/tests/t568-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t580-operator.h b/tests/t580-operator.h
index e23db70411..cb7e472fba 100644
--- a/tests/t580-operator.h
+++ b/tests/t580-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t590-operator.h b/tests/t590-operator.h
index 71c26bd525..c50595bc26 100644
--- a/tests/t590-operator.h
+++ b/tests/t590-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t591-operator.h b/tests/t591-operator.h
index 2cffaee1a1..0a834e5056 100644
--- a/tests/t591-operator.h
+++ b/tests/t591-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t595-operator.h b/tests/t595-operator.h
index 1e9e9e1ada..a5ddb3b9d7 100644
--- a/tests/t595-operator.h
+++ b/tests/t595-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t596-operator.h b/tests/t596-operator.h
index 4fe3d700f5..85dc60e259 100644
--- a/tests/t596-operator.h
+++ b/tests/t596-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t597-operator.h b/tests/t597-operator.h
index b68854b6fe..57b8e0dec6 100644
--- a/tests/t597-operator.h
+++ b/tests/t597-operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause

From 346c77e6436e93de99b1714e06a264fc70d47960 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 16 Dec 2025 10:44:41 -0700
Subject: [PATCH 516/571] occa - retire backends

---
 .gitlab-ci.yml                                |   7 +-
 Makefile                                      |  27 +-
 README.md                                     |  16 -
 backends/ceed-backend-list.h                  |   1 -
 backends/occa/ceed-occa-basis.cpp             |  82 --
 backends/occa/ceed-occa-basis.hpp             |  53 --
 backends/occa/ceed-occa-ceed-object.cpp       |  31 -
 backends/occa/ceed-occa-ceed-object.hpp       |  42 -
 backends/occa/ceed-occa-context.cpp           |  32 -
 backends/occa/ceed-occa-context.hpp           |  33 -
 backends/occa/ceed-occa-cpu-operator.cpp      | 751 ------------------
 backends/occa/ceed-occa-cpu-operator.hpp      | 132 ---
 backends/occa/ceed-occa-elem-restriction.cpp  | 372 ---------
 backends/occa/ceed-occa-elem-restriction.hpp  |  94 ---
 backends/occa/ceed-occa-gpu-operator.cpp      |  24 -
 backends/occa/ceed-occa-gpu-operator.hpp      |  30 -
 backends/occa/ceed-occa-kernels.hpp           |  16 -
 backends/occa/ceed-occa-operator-args.cpp     |  48 --
 backends/occa/ceed-occa-operator-args.hpp     |  40 -
 backends/occa/ceed-occa-operator-field.cpp    |  63 --
 backends/occa/ceed-occa-operator-field.hpp    |  55 --
 backends/occa/ceed-occa-operator.cpp          | 151 ----
 backends/occa/ceed-occa-operator.hpp          |  74 --
 backends/occa/ceed-occa-qfunction-args.cpp    |  60 --
 backends/occa/ceed-occa-qfunction-args.hpp    |  55 --
 backends/occa/ceed-occa-qfunction-field.cpp   |  22 -
 backends/occa/ceed-occa-qfunction-field.hpp   |  30 -
 backends/occa/ceed-occa-qfunction.cpp         | 242 ------
 backends/occa/ceed-occa-qfunction.hpp         |  54 --
 backends/occa/ceed-occa-qfunctioncontext.cpp  | 318 --------
 backends/occa/ceed-occa-qfunctioncontext.hpp  | 102 ---
 backends/occa/ceed-occa-simplex-basis.cpp     | 165 ----
 backends/occa/ceed-occa-simplex-basis.hpp     |  57 --
 backends/occa/ceed-occa-tensor-basis.cpp      | 236 ------
 backends/occa/ceed-occa-tensor-basis.hpp      |  64 --
 backends/occa/ceed-occa-types.hpp             |  60 --
 backends/occa/ceed-occa-vector.cpp            | 460 -----------
 backends/occa/ceed-occa-vector.hpp            | 133 ----
 backends/occa/ceed-occa.cpp                   | 329 --------
 backends/occa/ceed-occa.h                     | 149 ----
 backends/occa/kernels/elem-restriction.cpp    | 124 ---
 backends/occa/kernels/elem-restriction.hpp    |  31 -
 backends/occa/kernels/kernel-defines.hpp      |  13 -
 backends/occa/kernels/set-value.cpp           |  23 -
 backends/occa/kernels/set-value.hpp           |  20 -
 backends/occa/kernels/simplex-basis.hpp       |  31 -
 .../simplex-basis/cpu-simplex-basis.cpp       | 138 ----
 .../simplex-basis/gpu-simplex-basis.cpp       | 138 ----
 backends/occa/kernels/tensor-basis.hpp        |  38 -
 .../tensor-basis/cpu/tensor-basis-1d.cpp      | 108 ---
 .../tensor-basis/cpu/tensor-basis-2d.cpp      | 211 -----
 .../tensor-basis/cpu/tensor-basis-3d.cpp      | 306 -------
 .../tensor-basis/gpu/tensor-basis-1d.cpp      | 118 ---
 .../tensor-basis/gpu/tensor-basis-2d.cpp      | 162 ----
 .../tensor-basis/gpu/tensor-basis-3d.cpp      | 237 ------
 doc/sphinx/source/intro.md                    |   2 +-
 doc/sphinx/source/libCEEDapi.md               |   2 +-
 doc/sphinx/source/libCEEDdev.md               |   2 -
 doc/sphinx/source/releasenotes.md             |   4 +
 examples/petsc/area.c                         |   1 -
 examples/petsc/bps.c                          |   1 -
 examples/petsc/bpsraw.c                       |   1 -
 examples/petsc/bpssphere.c                    |   1 -
 examples/petsc/bpsswarm.c                     |   1 -
 examples/petsc/multigrid.c                    |   1 -
 examples/python/tutorial-0-ceed.ipynb         |   9 +-
 tests/junit.py                                |   7 +-
 67 files changed, 20 insertions(+), 6420 deletions(-)
 delete mode 100644 backends/occa/ceed-occa-basis.cpp
 delete mode 100644 backends/occa/ceed-occa-basis.hpp
 delete mode 100644 backends/occa/ceed-occa-ceed-object.cpp
 delete mode 100644 backends/occa/ceed-occa-ceed-object.hpp
 delete mode 100644 backends/occa/ceed-occa-context.cpp
 delete mode 100644 backends/occa/ceed-occa-context.hpp
 delete mode 100644 backends/occa/ceed-occa-cpu-operator.cpp
 delete mode 100644 backends/occa/ceed-occa-cpu-operator.hpp
 delete mode 100644 backends/occa/ceed-occa-elem-restriction.cpp
 delete mode 100644 backends/occa/ceed-occa-elem-restriction.hpp
 delete mode 100644 backends/occa/ceed-occa-gpu-operator.cpp
 delete mode 100644 backends/occa/ceed-occa-gpu-operator.hpp
 delete mode 100644 backends/occa/ceed-occa-kernels.hpp
 delete mode 100644 backends/occa/ceed-occa-operator-args.cpp
 delete mode 100644 backends/occa/ceed-occa-operator-args.hpp
 delete mode 100644 backends/occa/ceed-occa-operator-field.cpp
 delete mode 100644 backends/occa/ceed-occa-operator-field.hpp
 delete mode 100644 backends/occa/ceed-occa-operator.cpp
 delete mode 100644 backends/occa/ceed-occa-operator.hpp
 delete mode 100644 backends/occa/ceed-occa-qfunction-args.cpp
 delete mode 100644 backends/occa/ceed-occa-qfunction-args.hpp
 delete mode 100644 backends/occa/ceed-occa-qfunction-field.cpp
 delete mode 100644 backends/occa/ceed-occa-qfunction-field.hpp
 delete mode 100644 backends/occa/ceed-occa-qfunction.cpp
 delete mode 100644 backends/occa/ceed-occa-qfunction.hpp
 delete mode 100644 backends/occa/ceed-occa-qfunctioncontext.cpp
 delete mode 100644 backends/occa/ceed-occa-qfunctioncontext.hpp
 delete mode 100644 backends/occa/ceed-occa-simplex-basis.cpp
 delete mode 100644 backends/occa/ceed-occa-simplex-basis.hpp
 delete mode 100644 backends/occa/ceed-occa-tensor-basis.cpp
 delete mode 100644 backends/occa/ceed-occa-tensor-basis.hpp
 delete mode 100644 backends/occa/ceed-occa-types.hpp
 delete mode 100644 backends/occa/ceed-occa-vector.cpp
 delete mode 100644 backends/occa/ceed-occa-vector.hpp
 delete mode 100644 backends/occa/ceed-occa.cpp
 delete mode 100644 backends/occa/ceed-occa.h
 delete mode 100644 backends/occa/kernels/elem-restriction.cpp
 delete mode 100644 backends/occa/kernels/elem-restriction.hpp
 delete mode 100644 backends/occa/kernels/kernel-defines.hpp
 delete mode 100644 backends/occa/kernels/set-value.cpp
 delete mode 100644 backends/occa/kernels/set-value.hpp
 delete mode 100644 backends/occa/kernels/simplex-basis.hpp
 delete mode 100644 backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
 delete mode 100644 backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
 delete mode 100644 backends/occa/kernels/tensor-basis.hpp
 delete mode 100644 backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
 delete mode 100644 backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
 delete mode 100644 backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
 delete mode 100644 backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
 delete mode 100644 backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
 delete mode 100644 backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9a994f963b..a907bfb1db 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -80,9 +80,6 @@ noether-cpu:
     # -- LIBXSMM 7 April 2024
     - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
-    # -- OCCA v1.6.0
-    - cd .. && export OCCA_VERSION=occa-1.6.0 && { [[ -d $OCCA_VERSION ]] || { git clone --depth 1 --branch v1.6.0 https://github.com/libocca/occa.git $OCCA_VERSION && cd $OCCA_VERSION && export ENABLE_OPENCL="OFF" ENABLE_DPCPP="OFF" ENABLE_HIP="OFF" ENABLE_CUDA="OFF" && ./configure-cmake.sh && cmake --build build --parallel $NPROC_CPU && cmake --install build && cd ..; }; } && export OCCA_DIR=$PWD/$OCCA_VERSION/install && cd libCEED
-    - echo "-------------- OCCA ----------------" && git -C $OCCA_DIR describe --tags && LD_LIBRARY_PATH=$OCCA_DIR/lib $OCCA_DIR/bin/occa info
   script:
     - rm -f .SUCCESS
     # libCEED
@@ -91,7 +88,7 @@ noether-cpu:
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
     - make clean
-    - OCCA_DIR= PEDANTIC=1 make -j$NPROC_CPU
+    - PEDANTIC=1 make -j$NPROC_CPU
     - make -j$NPROC_CPU
     # -- libCEED only tests
     - echo "-------------- core tests ----------"
@@ -115,7 +112,7 @@ noether-cpu:
     - export NPROC_POOL=1
     - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=nek NEK5K_DIR=$NEK5K_DIR
     # -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
-    - OCCA_DIR= BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
+    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
     - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
     - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
diff --git a/Makefile b/Makefile
index 07f557ff1b..5ecaecd71a 100644
--- a/Makefile
+++ b/Makefile
@@ -50,9 +50,6 @@ HIP_ARCH ?=
 # env variable MAGMA_DIR can be used too
 MAGMA_DIR ?= ../magma
 
-# OCCA_DIR env variable should point to OCCA main (github.com/libocca/occa)
-OCCA_DIR ?= ../occa/install
-
 
 # ------------------------------------------------------------
 # Compiler flags
@@ -103,7 +100,7 @@ UNDERSCORE ?= 1
 # Verbose mode, V or VERBOSE
 V ?= $(VERBOSE)
 
-# Warning: SANTIZ options still don't run with /gpu/occa
+# SANTIZ options
 AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
 
 # Note: Intel oneAPI C/C++ compiler is now icx/icpx
@@ -324,7 +321,6 @@ sycl-shared.cpp:= $(sort $(wildcard backends/sycl-shared/*.sycl.cpp))
 sycl-gen.cpp   := $(sort $(wildcard backends/sycl-gen/*.sycl.cpp))
 magma.c        := $(sort $(wildcard backends/magma/*.c))
 magma.cpp      := $(sort $(wildcard backends/magma/*.cpp))
-occa.cpp       := $(sort $(shell find backends/occa -type f -name *.cpp))
 
 # Tests
 tests.c := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c))
@@ -442,7 +438,6 @@ info:
 	$(info ROCM_DIR      = $(ROCM_DIR)$(call backend_status,$(HIP_BACKENDS)))
 	$(info SYCL_DIR      = $(SYCL_DIR)$(call backend_status,$(SYCL_BACKENDS)))
 	$(info MAGMA_DIR     = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
-	$(info OCCA_DIR      = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
 	$(info )
 	$(info -----------------------------------------)
 	$(info )
@@ -619,22 +614,6 @@ ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
   BACKENDS_MAKE += $(MAGMA_BACKENDS)
 endif
 
-# OCCA Backends
-OCCA_BACKENDS = /cpu/self/occa
-ifneq ($(wildcard $(OCCA_DIR)/lib/libocca.*),)
-  OCCA_MODES := $(shell LD_LIBRARY_PATH=$(OCCA_DIR)/lib $(OCCA_DIR)/bin/occa modes)
-  OCCA_BACKENDS += $(if $(filter OpenMP,$(OCCA_MODES)),/cpu/openmp/occa)
-  OCCA_BACKENDS += $(if $(filter dpcpp,$(OCCA_MODES)),/gpu/dpcpp/occa)
-  OCCA_BACKENDS += $(if $(filter OpenCL,$(OCCA_MODES)),/gpu/opencl/occa)
-  OCCA_BACKENDS += $(if $(filter HIP,$(OCCA_MODES)),/gpu/hip/occa)
-  OCCA_BACKENDS += $(if $(filter CUDA,$(OCCA_MODES)),/gpu/cuda/occa)
-  $(libceeds) : CPPFLAGS += -I$(OCCA_DIR)/include
-  PKG_LIBS += -L$(abspath $(OCCA_DIR))/lib -locca
-  LIBCEED_CONTAINS_CXX = 1
-  libceed.cpp += $(occa.cpp)
-  BACKENDS_MAKE += $(OCCA_BACKENDS)
-endif
-
 BACKENDS ?= $(BACKENDS_MAKE)
 export BACKENDS
 
@@ -976,7 +955,7 @@ CLANG_TIDY ?= clang-tidy
 	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c11 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\"" -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\""
 
 %.cpp.tidy : %.cpp
-	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(OCCA_DIR)/include -I$(ROCM_DIR)/include
+	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include
 
 tidy-c   : $(libceed.c:%=%.tidy)
 tidy-cpp : $(libceed.cpp:%=%.tidy)
@@ -1032,7 +1011,7 @@ print-% :
 CONFIG_VARS = CC CXX FC NVCC NVCC_CXX HIPCC \
   OPT CFLAGS CPPFLAGS CXXFLAGS FFLAGS NVCCFLAGS HIPCCFLAGS SYCLFLAGS \
   AR ARFLAGS LDFLAGS LDLIBS LIBCXX SED \
-  MAGMA_DIR OCCA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR
+  MAGMA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR
 
 # $(call needs_save,CFLAGS) returns true (a nonempty string) if CFLAGS
 # was set on the command line or in config.mk (where it will appear as
diff --git a/README.md b/README.md
index 6a2f050cae..84b46748bf 100644
--- a/README.md
+++ b/README.md
@@ -183,13 +183,6 @@ There are multiple supported backends, which can be selected at runtime in the e
 | `/gpu/hip/magma`           | HIP MAGMA kernels                                 | No                    |
 | `/gpu/hip/magma/det`       | HIP MAGMA kernels                                 | Yes                   |
 ||
-| **OCCA**                   |
-| `/*/occa`                  | Selects backend based on available OCCA modes     | Yes                   |
-| `/cpu/self/occa`           | OCCA backend with serial CPU kernels              | Yes                   |
-| `/cpu/openmp/occa`         | OCCA backend with OpenMP kernels                  | Yes                   |
-| `/cpu/dpcpp/occa`          | OCCA backend with DPC++ kernels                   | Yes                   |
-| `/gpu/cuda/occa`           | OCCA backend with CUDA kernels                    | Yes                   |
-| `/gpu/hip/occa`            | OCCA backend with HIP kernels                     | Yes                   |
 
 The `/cpu/self/*/serial` backends process one element at a time and are intended for meshes with a smaller number of high order elements.
 The `/cpu/self/*/blocked` backends process blocked batches of eight interlaced elements and are intended for meshes with higher numbers of elements.
@@ -230,15 +223,6 @@ For example:
 
 > - `/gpu/cuda/gen:device_id=1`
 
-The `/*/occa` backends rely upon the [OCCA](http://github.com/libocca/occa) package to provide cross platform performance.
-To enable the OCCA backend, the environment variable `OCCA_DIR` must point to the top-level OCCA directory, with the OCCA library located in the `${OCCA_DIR}/lib` (By default, `OCCA_DIR` is set to `../occa`).
-OCCA version 1.6.0 or newer is required.
-
-Users can pass specific OCCA device properties after setting the CEED resource.
-For example:
-
-> - `"/*/occa:mode='CUDA',device_id=0"`
-
 Bit-for-bit reproducibility is important in some applications.
 However, some libCEED backends use non-deterministic operations, such as `atomicAdd` for increased performance.
 The backends which are capable of generating reproducible results, with the proper compilation options, are highlighted in the list above.
diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
index 8650d956e2..017d8a76a0 100644
--- a/backends/ceed-backend-list.h
+++ b/backends/ceed-backend-list.h
@@ -26,7 +26,6 @@ CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
 CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
 CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
 CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
-CEED_BACKEND(CeedRegister_Occa, 6, "/cpu/self/occa", "/cpu/openmp/occa", "/gpu/dpcpp/occa", "/gpu/opencl/occa", "/gpu/hip/occa", "/gpu/cuda/occa")
 CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
 CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
 CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
diff --git a/backends/occa/ceed-occa-basis.cpp b/backends/occa/ceed-occa-basis.cpp
deleted file mode 100644
index 53aa042e5f..0000000000
--- a/backends/occa/ceed-occa-basis.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-basis.hpp"
-
-#include "ceed-occa-tensor-basis.hpp"
-
-namespace ceed {
-namespace occa {
-Basis::Basis() : ceedComponentCount(0), dim(0), P(0), Q(0) {}
-
-Basis::~Basis() {}
-
-Basis *Basis::getBasis(CeedBasis basis, const bool assertValid) {
-  if (!basis) {
-    return NULL;
-  }
-
-  int    ierr;
-  Basis *basis_ = NULL;
-
-  ierr = CeedBasisGetData(basis, &basis_);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return basis_;
-}
-
-Basis *Basis::from(CeedBasis basis) {
-  Basis *basis_ = getBasis(basis);
-  if (!basis_) {
-    return NULL;
-  }
-
-  CeedCallOcca(basis_->setCeedFields(basis));
-
-  return basis_;
-}
-
-Basis *Basis::from(CeedOperatorField operatorField) {
-  CeedBasis ceedBasis;
-  CeedCallOcca(CeedOperatorFieldGetBasis(operatorField, &ceedBasis));
-  Basis *basis = from(ceedBasis);
-  CeedCallOcca(CeedBasisDestroy(&ceedBasis));
-  return basis;
-}
-
-int Basis::setCeedFields(CeedBasis basis) {
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-  CeedCallBackend(CeedBasisGetNumComponents(basis, &ceedComponentCount));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-----------
-int Basis::registerCeedFunction(Ceed ceed, CeedBasis basis, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Basis", basis, fname, f);
-}
-
-int Basis::ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v) {
-  Basis  *basis_ = Basis::from(basis);
-  Vector *U      = Vector::from(u);
-  Vector *V      = Vector::from(v);
-
-  if (!basis_) {
-    return staticCeedError("Incorrect CeedBasis argument: op");
-  }
-
-  return basis_->apply(nelem, tmode, emode, U, V);
-}
-
-int Basis::ceedDestroy(CeedBasis basis) {
-  delete getBasis(basis, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-basis.hpp b/backends/occa/ceed-occa-basis.hpp
deleted file mode 100644
index 1e8c01ead4..0000000000
--- a/backends/occa/ceed-occa-basis.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_BASIS_HEADER
-#define CEED_OCCA_BASIS_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-class Basis : public CeedObject {
- public:
-  // Ceed object information
-  CeedInt ceedComponentCount;
-
-  // Owned information
-  CeedInt dim;
-  CeedInt P;
-  CeedInt Q;
-
-  Basis();
-
-  virtual ~Basis();
-
-  static Basis *getBasis(CeedBasis basis, const bool assertValid = true);
-
-  static Basis *from(CeedBasis basis);
-  static Basis *from(CeedOperatorField operatorField);
-
-  int setCeedFields(CeedBasis basis);
-
-  virtual bool isTensorBasis() const = 0;
-
-  virtual const char *getFunctionSource() const = 0;
-
-  virtual int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *u, Vector *v) = 0;
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedBasis basis, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v);
-
-  static int ceedDestroy(CeedBasis basis);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-ceed-object.cpp b/backends/occa/ceed-occa-ceed-object.cpp
deleted file mode 100644
index 7f82ae6cc0..0000000000
--- a/backends/occa/ceed-occa-ceed-object.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-ceed-object.hpp"
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-CeedObject::CeedObject(Ceed ceed_) : ceed(ceed_) {}
-
-::occa::device CeedObject::getDevice() {
-  if (!_device.isInitialized()) {
-    _device = Context::from(ceed)->device;
-  }
-  return _device;
-}
-
-bool CeedObject::usingCpuDevice() const { return Context::from(ceed)->usingCpuDevice(); }
-
-bool CeedObject::usingGpuDevice() const { return Context::from(ceed)->usingGpuDevice(); }
-
-int CeedObject::ceedError(const std::string &message) const { return CeedError(ceed, CEED_ERROR_BACKEND, message.c_str()); }
-
-int CeedObject::staticCeedError(const std::string &message) { return CeedError(NULL, CEED_ERROR_BACKEND, message.c_str()); }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-ceed-object.hpp b/backends/occa/ceed-occa-ceed-object.hpp
deleted file mode 100644
index be90bf447e..0000000000
--- a/backends/occa/ceed-occa-ceed-object.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_CEEDOBJECT_HEADER
-#define CEED_OCCA_CEEDOBJECT_HEADER
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-class CeedObject {
- private:
-  ::occa::device _device;
-
- public:
-  Ceed ceed;
-
-  CeedObject(Ceed ceed_ = NULL);
-
-  ::occa::device getDevice();
-
-  bool usingCpuDevice() const;
-  bool usingGpuDevice() const;
-
-  int        ceedError(const std::string &message) const;
-  static int staticCeedError(const std::string &message);
-};
-
-namespace SyncState {
-static const int none   = 0;
-static const int host   = (1 << 0);
-static const int device = (1 << 1);
-static const int all    = host | device;
-}  // namespace SyncState
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-context.cpp b/backends/occa/ceed-occa-context.cpp
deleted file mode 100644
index e22b221338..0000000000
--- a/backends/occa/ceed-occa-context.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-Context::Context(::occa::device device_) : device(device_) {
-  const std::string mode = device.mode();
-  _usingCpuDevice        = (mode == "Serial" || mode == "OpenMP");
-  _usingGpuDevice        = (mode == "CUDA" || mode == "HIP" || mode == "OpenCL");
-}
-
-Context *Context::from(Ceed ceed) {
-  if (!ceed) {
-    return NULL;
-  }
-
-  Context *context;
-  CeedGetData(ceed, (void **)&context);
-  return context;
-}
-
-bool Context::usingCpuDevice() const { return _usingCpuDevice; }
-
-bool Context::usingGpuDevice() const { return _usingGpuDevice; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-context.hpp b/backends/occa/ceed-occa-context.hpp
deleted file mode 100644
index 4a41d79411..0000000000
--- a/backends/occa/ceed-occa-context.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_CONTEXT_HEADER
-#define CEED_OCCA_CONTEXT_HEADER
-
-#include "ceed-occa-types.hpp"
-
-namespace ceed {
-namespace occa {
-class Context {
- private:
-  bool _usingCpuDevice;
-  bool _usingGpuDevice;
-
- public:
-  ::occa::device device;
-
-  Context(::occa::device device_);
-
-  static Context *from(Ceed ceed);
-
-  bool usingCpuDevice() const;
-  bool usingGpuDevice() const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-cpu-operator.cpp b/backends/occa/ceed-occa-cpu-operator.cpp
deleted file mode 100644
index bdf6efea37..0000000000
--- a/backends/occa/ceed-occa-cpu-operator.cpp
+++ /dev/null
@@ -1,751 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-cpu-operator.hpp"
-
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-qfunction.hpp"
-#include "ceed-occa-qfunctioncontext.hpp"
-#include "ceed-occa-simplex-basis.hpp"
-#include "ceed-occa-tensor-basis.hpp"
-
-#define CEED_OCCA_PRINT_KERNEL_HASHES 0
-
-namespace ceed {
-namespace occa {
-CpuOperator::CpuOperator() {}
-
-CpuOperator::~CpuOperator() {}
-
-void CpuOperator::setupVectors() {
-  setupVectors(args.inputCount(), args.opInputs, args.qfInputs, dofInputs);
-  setupVectors(args.outputCount(), args.opOutputs, args.qfOutputs, dofOutputs);
-}
-
-void CpuOperator::setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors) {
-  for (int i = 0; i < fieldCount; ++i) {
-    const QFunctionField &qfField = qfFields[i];
-    const OperatorField  &opField = opFields[i];
-
-    if (qfField.evalMode == CEED_EVAL_WEIGHT) {
-      // Weight kernel doesn't use the input
-      vectors.push_back(NULL);
-      continue;
-    }
-
-    int entries;
-    if (qfField.evalMode == CEED_EVAL_NONE) {
-      // The output vector stores values at quadrature points
-      entries = (ceedElementCount * ceedQ * qfField.size);
-    } else {
-      // The output vector stores the element dof values
-      entries = (ceedElementCount * opField.getElementSize() * opField.getComponentCount());
-    }
-
-    Vector *dofVector = new Vector();
-    dofVector->ceed   = ceed;
-    dofVector->resize(entries);
-
-    vectors.push_back(dofVector);
-  }
-}
-
-void CpuOperator::freeVectors() {
-  for (int i = 0; i < args.inputCount(); ++i) {
-    delete dofInputs[i];
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    delete dofOutputs[i];
-  }
-  dofInputs.clear();
-  dofOutputs.clear();
-}
-
-void CpuOperator::setupInputs(Vector *in) {
-  for (int i = 0; i < args.inputCount(); ++i) {
-    // Weight kernel doesn't use the input vector
-    if (args.getInputEvalMode(i) == CEED_EVAL_WEIGHT) {
-      continue;
-    }
-
-    const OperatorField &opField = args.getOpInput(i);
-
-    Vector *input  = opField.usesActiveVector() ? in : opField.vec;
-    Vector *output = dofInputs[i];
-
-    opField.elemRestriction->apply(CEED_NOTRANSPOSE, *input, *output);
-  }
-}
-
-void CpuOperator::setupOutputs(Vector *out) {
-  for (int i = 0; i < args.outputCount(); ++i) {
-    // Weight is not supported for output vectors
-    if (args.getOutputEvalMode(i) == CEED_EVAL_WEIGHT) {
-      continue;
-    }
-
-    const OperatorField &opField = args.getOpOutput(i);
-
-    Vector *input  = dofOutputs[i];
-    Vector *output = opField.usesActiveVector() ? out : opField.vec;
-
-    opField.elemRestriction->apply(CEED_TRANSPOSE, *input, *output);
-  }
-}
-
-void CpuOperator::applyQFunction() {
-  if (qfunction->qFunctionContext) {
-    QFunctionContext *ctx = QFunctionContext::from(qfunction->qFunctionContext);
-    applyAddKernel.pushArg(ctx->getKernelArg());
-  } else {
-    applyAddKernel.pushArg(::occa::null);
-  }
-  applyAddKernel.pushArg(ceedElementCount);
-
-  for (int i = 0; i < args.inputCount(); ++i) {
-    const bool isInput = true;
-    pushKernelArgs(dofInputs[i], isInput, i);
-  }
-
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const bool isInput = false;
-    pushKernelArgs(dofOutputs[i], isInput, i);
-  }
-
-  applyAddKernel.run();
-}
-
-void CpuOperator::pushKernelArgs(Vector *vec, const bool isInput, const int index) {
-  const OperatorField  &opField = args.getOpField(isInput, index);
-  const QFunctionField &qfField = args.getQfField(isInput, index);
-
-  if (opField.hasBasis()) {
-    if (opField.usingTensorBasis()) {
-      pushTensorBasisKernelArgs(qfField, *((TensorBasis *)opField.basis));
-    } else {
-      pushSimplexBasisKernelArgs(qfField, *((SimplexBasis *)opField.basis));
-    }
-  }
-
-  if (vec) {
-    if (isInput) {
-      applyAddKernel.pushArg(vec->getConstKernelArg());
-    } else {
-      applyAddKernel.pushArg(vec->getKernelArg());
-    }
-  } else {
-    applyAddKernel.pushArg(::occa::null);
-  }
-}
-
-void CpuOperator::pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis) {
-  switch (qfField.evalMode) {
-    case CEED_EVAL_INTERP: {
-      applyAddKernel.pushArg(basis.interp1D);
-      break;
-    }
-    case CEED_EVAL_GRAD: {
-      applyAddKernel.pushArg(basis.interp1D);
-      applyAddKernel.pushArg(basis.grad1D);
-      break;
-    }
-    case CEED_EVAL_WEIGHT: {
-      applyAddKernel.pushArg(basis.qWeight1D);
-      break;
-    }
-    default: {
-    }
-  }
-}
-
-void CpuOperator::pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis) {
-  switch (qfField.evalMode) {
-    case CEED_EVAL_INTERP: {
-      applyAddKernel.pushArg(basis.interp);
-      break;
-    }
-    case CEED_EVAL_GRAD: {
-      applyAddKernel.pushArg(basis.grad);
-      break;
-    }
-    case CEED_EVAL_WEIGHT: {
-      applyAddKernel.pushArg(basis.qWeight);
-      break;
-    }
-    default: {
-    }
-  }
-}
-
-::occa::properties CpuOperator::getKernelProps() {
-  ::occa::properties props = qfunction->getKernelProps(ceedQ);
-
-  props["defines/OCCA_Q"] = ceedQ;
-
-  return props;
-}
-
-void CpuOperator::applyAdd(Vector *in, Vector *out) {
-  // Setup helper vectors
-  setupVectors();
-
-  // Dof nodes -> local dofs
-  setupInputs(in);
-
-  // Apply qFunction
-  applyQFunction();
-
-  // Local dofs -> dof nodes
-  setupOutputs(out);
-
-  // Cleanup helper vectors
-  freeVectors();
-}
-
-::occa::kernel CpuOperator::buildApplyAddKernel() {
-  std::stringstream ss;
-
-  addBasisFunctionSource(ss);
-
-  addKernelSource(ss);
-
-  const std::string kernelSource = ss.str();
-
-  CeedDebug(ceed, kernelSource.c_str());
-
-  // TODO: Store a kernel per Q
-  return getDevice().buildKernelFromString(kernelSource, "applyAdd", getKernelProps());
-}
-
-//---[ Kernel Generation ]--------------------
-void CpuOperator::addBasisFunctionSource(std::stringstream &ss) {
-  BasisVector sourceBasis;
-  for (int i = 0; i < args.inputCount(); ++i) {
-    addBasisIfMissingSource(sourceBasis, args.getOpInput(i).basis);
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    addBasisIfMissingSource(sourceBasis, args.getOpOutput(i).basis);
-  }
-
-  // Make sure there's a break between past code
-  ss << std::endl;
-
-  // Add source code for each unique basis function
-  const int basisCount = (int)sourceBasis.size();
-  for (int i = 0; i < basisCount; ++i) {
-    Basis &basis = *(sourceBasis[i]);
-
-    ss << "// Code generation for basis " << i + 1 << std::endl << "//---[ START ]-------------------------------" << std::endl;
-
-    // Undefine and redefine required variables
-    if (basis.isTensorBasis()) {
-      TensorBasis &basisTensor = (TensorBasis &)basis;
-      ss << "#undef  TENSOR_FUNCTION" << std::endl
-         << "#undef  P1D" << std::endl
-         << "#undef  Q1D" << std::endl
-         << "#define P1D " << basisTensor.P1D << std::endl
-         << "#define Q1D " << basisTensor.Q1D << std::endl;
-    } else {
-      SimplexBasis &basisSimplex = (SimplexBasis &)basis;
-      ss << "#undef  SIMPLEX_FUNCTION" << std::endl
-         << "#undef  DIM" << std::endl
-         << "#undef  P" << std::endl
-         << "#undef  Q" << std::endl
-         << "#define DIM " << basisSimplex.dim << std::endl
-         << "#define P   " << basisSimplex.P << std::endl
-         << "#define Q   " << basisSimplex.Q << std::endl;
-    }
-
-    ss << std::endl << basis.getFunctionSource() << std::endl << "//---[ END ]---------------------------------" << std::endl;
-  }
-}
-
-void CpuOperator::addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis) {
-  // Avoid adding duplicate sources which will result in colliding symbol names
-
-  // No basis
-  if (!basis) {
-    return;
-  }
-
-  // Fast enough since we expect a small number of inputs/outputs
-  const int existingBasisCount = (int)sourceBasis.size();
-  for (int i = 0; i < existingBasisCount; ++i) {
-    Basis *other = sourceBasis[i];
-    // They are different basis types so other != basis
-    if (basis->isTensorBasis() != other->isTensorBasis()) {
-      continue;
-    }
-
-    if (basis->dim == other->dim && basis->P == other->P && basis->Q == other->Q) {
-      // `other` wil generate the same code
-      return;
-    }
-  }
-
-  // Basis didn't match any other existing basis
-  sourceBasis.push_back(basis);
-}
-
-void CpuOperator::addKernelSource(std::stringstream &ss) {
-  // Make sure there's a break between past code
-  ss << std::endl;
-
-  ss << "@kernel void applyAdd(" << std::endl;
-
-  addKernelArgsSource(ss);
-
-  ss << std::endl
-     << ") {" << std::endl
-     << "  @tile(128, @outer, @inner)" << std::endl
-     << "  for (int element = 0; element < elementCount; ++element) {" << std::endl;
-
-#if CEED_OCCA_PRINT_KERNEL_HASHES
-  // Print to see which kernel is being run
-  ss << "    if (element == 0) {" << std::endl
-     << "      printf(\"\\n\\nOperator Kernel: \" OKL_KERNEL_HASH \"\\n\\n\");" << std::endl
-     << "    }" << std::endl;
-#endif
-
-  addQuadArraySource(ss);
-
-  ss << std::endl << "    // [Start] Transforming inputs to quadrature points" << std::endl;
-  addInputSetupSource(ss);
-  ss << "    // [End] Transforming inputs to quadrature points" << std::endl << std::endl;
-
-  addQFunctionApplicationSource(ss);
-
-  ss << std::endl << "    // [Start] Transforming outputs to quadrature points" << std::endl;
-  addOutputSetupSource(ss);
-  ss << "    // [End] Transforming outputs to quadrature points" << std::endl;
-
-  ss << "  }" << std::endl << "}" << std::endl;
-}
-
-void CpuOperator::addKernelArgsSource(std::stringstream &ss) {
-  ss << "  void *ctx," << std::endl << "  const CeedInt elementCount";
-
-  for (int i = 0; i < args.inputCount(); ++i) {
-    const bool isInput = true;
-    addKernelArgSource(ss, isInput, i);
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const bool isInput = false;
-    addKernelArgSource(ss, isInput, i);
-  }
-}
-
-void CpuOperator::addKernelArgSource(std::stringstream &ss, const bool isInput, const int index) {
-  const OperatorField  &opField = args.getOpField(isInput, index);
-  const QFunctionField &qfField = args.getQfField(isInput, index);
-
-  std::stringstream dimAttribute;
-  if (opField.hasBasis()) {
-    ss << ',' << std::endl;
-    if (opField.usingTensorBasis()) {
-      addTensorKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute);
-    } else {
-      addSimplexKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute);
-    }
-  }
-
-  ss << ',' << std::endl;
-  if (isInput) {
-    ss << "  const CeedScalar *" << dofInputVar(index) << dimAttribute.str();
-  } else {
-    ss << "  CeedScalar *" << dofOutputVar(index) << dimAttribute.str();
-  }
-}
-
-void CpuOperator::addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                           const QFunctionField &qfField, std::stringstream &dimAttribute) {
-  TensorBasis &basis = *((TensorBasis *)opField.basis);
-
-  dimAttribute << " @dim(";
-
-  if (qfField.evalMode == CEED_EVAL_INTERP) {
-    ss << "  const CeedScalar *" << interpVar(isInput, index);
-
-    // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)
-    for (int i = 0; i < basis.dim; ++i) {
-      dimAttribute << basis.P1D << ", ";
-    }
-    dimAttribute << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_GRAD) {
-    ss << "  const CeedScalar *" << interpVar(isInput, index) << ',' << std::endl << "  const CeedScalar *" << gradVar(isInput, index);
-
-    // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)
-    for (int i = 0; i < basis.dim; ++i) {
-      dimAttribute << basis.P1D << ", ";
-    }
-    dimAttribute << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_WEIGHT) {
-    ss << "  const CeedScalar *" << qWeightVar(isInput, index);
-
-    // @dim(Q1D, Q1D, elementCount)
-    for (int i = 0; i < basis.dim; ++i) {
-      dimAttribute << basis.Q1D << ", ";
-    }
-    dimAttribute << "elementCount";
-  } else {
-    // Clear @dim
-    dimAttribute.str("");
-    return;
-  }
-
-  dimAttribute << ")";
-}
-
-void CpuOperator::addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                            const QFunctionField &qfField, std::stringstream &dimAttribute) {
-  SimplexBasis &basis = *((SimplexBasis *)opField.basis);
-
-  dimAttribute << " @dim(";
-
-  if (qfField.evalMode == CEED_EVAL_INTERP) {
-    ss << "  const CeedScalar *" << interpVar(isInput, index);
-
-    // @dim(P, BASIS_COMPONENT_COUNT, elementCount)
-    dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_GRAD) {
-    ss << "  const CeedScalar *" << gradVar(isInput, index);
-
-    // @dim(P, BASIS_COMPONENT_COUNT, elementCount)
-    dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_WEIGHT) {
-    ss << "  const CeedScalar *" << qWeightVar(isInput, index);
-
-    // @dim(Q, elementCount)
-    dimAttribute << basis.Q << ", "
-                 << "elementCount";
-  } else {
-    // Clear @dim
-    dimAttribute.str("");
-    return;
-  }
-
-  dimAttribute << ")";
-}
-
-void CpuOperator::addQuadArraySource(std::stringstream &ss) {
-  const int inputs  = args.inputCount();
-  const int outputs = args.outputCount();
-
-  const std::string quadInput  = "quadInput";
-  const std::string quadOutput = "quadOutput";
-
-  ss << "    // Store the transformed input quad values" << std::endl;
-  for (int i = 0; i < inputs; ++i) {
-    const bool isInput = true;
-    addSingleQfunctionQuadArraySource(ss, isInput, i, quadInput);
-  }
-
-  ss << std::endl << "    // Store the transformed output quad values" << std::endl;
-  for (int i = 0; i < outputs; ++i) {
-    const bool isInput = false;
-    addSingleQfunctionQuadArraySource(ss, isInput, i, quadOutput);
-  }
-  ss << std::endl;
-
-  ss << std::endl << "    // Store all input pointers in a single array" << std::endl;
-  addQfunctionQuadArraySource(ss, true, inputs, quadInput);
-
-  ss << std::endl << "    // Store all output pointers in a single array" << std::endl;
-  addQfunctionQuadArraySource(ss, false, outputs, quadOutput);
-
-  ss << std::endl;
-}
-
-void CpuOperator::addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name) {
-  // Output:
-  //   CeedScalar quadInput0[DIM][COMPONENTS][OCCA_Q];
-  //   CeedScalar quadInput0[OCCA_Q * SIZE];
-
-  const OperatorField &opField  = args.getOpField(isInput, index);
-  CeedEvalMode         evalMode = args.getEvalMode(isInput, index);
-
-  if (evalMode == CEED_EVAL_GRAD) {
-    ss << "    CeedScalar " << indexedVar(name, index) << "[" << opField.getDim() << "]"
-       << "[" << opField.getComponentCount() << "]"
-       << "[OCCA_Q];" << std::endl;
-  } else if (evalMode == CEED_EVAL_INTERP) {
-    ss << "    CeedScalar " << indexedVar(name, index) << "[" << opField.getComponentCount() << "]"
-       << "[OCCA_Q];" << std::endl;
-  } else {
-    const QFunctionField &qfField = args.getQfField(isInput, index);
-
-    ss << "    CeedScalar " << indexedVar(name, index) << "[OCCA_Q * " << qfField.size << "];" << std::endl;
-  }
-}
-
-void CpuOperator::addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name) {
-  // Output:
-  //   CeedScalar *quadInputs[2] = {
-  //     (CeedScalar*) quadInput0,
-  //     (CeedScalar*) quadInput1
-  //   };
-
-  // Add an 's': quadInput -> quadInputs
-  const std::string arrayName = name + "s";
-
-  ss << "    CeedScalar *" << arrayName << "[" << count << "] = {" << std::endl;
-  for (int i = 0; i < count; ++i) {
-    if (i) {
-      ss << ',' << std::endl;
-    }
-    ss << "      (CeedScalar*) " << indexedVar(name, i);
-  }
-  ss << std::endl << "    };" << std::endl;
-}
-
-void CpuOperator::addInputSetupSource(std::stringstream &ss) {
-  const bool isInput = true;
-  addBasisApplySource(ss, isInput, args.inputCount());
-}
-
-void CpuOperator::addOutputSetupSource(std::stringstream &ss) {
-  const bool isInput = false;
-  addBasisApplySource(ss, isInput, args.outputCount());
-}
-
-void CpuOperator::addBasisApplySource(std::stringstream &ss, const bool isInput, const int count) {
-  for (int i = 0; i < count; ++i) {
-    CeedEvalMode evalMode = args.getEvalMode(isInput, i);
-
-    if (evalMode == CEED_EVAL_INTERP) {
-      addInterpSource(ss, isInput, i);
-    } else if (evalMode == CEED_EVAL_GRAD) {
-      const bool hasTensorBasis = args.getOpField(isInput, i).usingTensorBasis();
-      if (hasTensorBasis) {
-        addGradTensorSource(ss, isInput, i);
-      } else {
-        addGradSimplexSource(ss, isInput, i);
-      }
-    } else if (evalMode == CEED_EVAL_WEIGHT) {
-      addWeightSource(ss, isInput, i);
-    } else if (evalMode == CEED_EVAL_NONE) {
-      addCopySource(ss, isInput, i);
-    }
-  }
-}
-
-void CpuOperator::addInterpSource(std::stringstream &ss, const bool isInput, const int index) {
-  const OperatorField &opField          = args.getOpField(isInput, index);
-  const bool           usingTensorBasis = opField.usingTensorBasis();
-  const int            components       = opField.getComponentCount();
-  const int            dim              = opField.getDim();
-
-  const std::string weights = interpVar(isInput, index);
-
-  std::string dimArgs;
-  if (usingTensorBasis) {
-    for (int i = 0; i < dim; ++i) {
-      if (i) {
-        dimArgs += ", ";
-      }
-      dimArgs += '0';
-    }
-  } else {
-    dimArgs = "0";
-  }
-
-  std::string input, output;
-  if (isInput) {
-    input  = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)";
-    output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]";
-  } else {
-    input  = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]";
-    output = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)";
-  }
-
-  ss << "    // Applying interp (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int component = 0; component < " << components << "; ++component) {" << std::endl
-     << "      " << elementFunction(isInput, index) << "(" << std::endl
-     << "        " << weights << ',' << std::endl
-     << "        " << input << ',' << std::endl
-     << "        " << output << std::endl
-     << "      );" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addGradTensorSource(std::stringstream &ss, const bool isInput, const int index) {
-  const OperatorField &opField    = args.getOpField(isInput, index);
-  const int            components = opField.getComponentCount();
-  const int            dim        = opField.getDim();
-
-  const std::string B  = interpVar(isInput, index);
-  const std::string Bx = gradVar(isInput, index);
-
-  std::string dimArgs;
-  for (int i = 0; i < dim; ++i) {
-    if (i) {
-      dimArgs += ", ";
-    }
-    dimArgs += '0';
-  }
-
-  std::string inputs, outputs;
-  if (isInput) {
-    inputs = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)";
-
-    for (int i = 0; i < dim; ++i) {
-      if (i) {
-        outputs += ",\n        ";
-      }
-      const std::string iStr = std::to_string(i);
-      outputs += "(CeedScalar*) " + indexedVar("quadInput", index) + "[" + iStr + "][component]";
-    }
-  } else {
-    for (int i = 0; i < dim; ++i) {
-      if (i) {
-        inputs += ",\n        ";
-      }
-      const std::string iStr = std::to_string(i);
-      inputs += "(CeedScalar*) " + indexedVar("quadOutput", index) + "[" + iStr + "][component]";
-    }
-
-    outputs = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)";
-  }
-
-  ss << "    // Applying grad-tensor (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int component = 0; component < " << components << "; ++component) {" << std::endl
-     << "      " << elementFunction(isInput, index) << "(" << std::endl
-     << "        " << B << ',' << std::endl
-     << "        " << Bx << ',' << std::endl
-     << "        " << inputs << ',' << std::endl
-     << "        " << outputs << std::endl
-     << "      );" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index) {
-  const int components = (args.getOpField(isInput, index).getComponentCount());
-
-  const std::string weights = gradVar(isInput, index);
-
-  std::string input, output;
-  if (isInput) {
-    input  = "&" + dofInputVar(index) + "(0, component, element)";
-    output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]";
-  } else {
-    input  = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]";
-    output = "&" + dofOutputVar(index) + "(0, component, element)";
-  }
-
-  ss << "    // Applying grad-simplex (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int component = 0; component < " << components << "; ++component) {" << std::endl
-     << "      " << elementFunction(isInput, index) << "(" << std::endl
-     << "        " << weights << ',' << std::endl
-     << "        " << input << ',' << std::endl
-     << "        " << output << std::endl
-     << "      );" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addWeightSource(std::stringstream &ss, const bool isInput, const int index) {
-  const std::string weights = qWeightVar(isInput, index);
-
-  std::string output;
-  if (isInput) {
-    // TODO: Can the weight operator handle multiple components?
-    output = "(CeedScalar*) " + indexedVar("quadInput", index);
-  } else {
-    output = "&" + dofOutputVar(index) + "(0, element)";
-  }
-
-  ss << "    // Applying weight (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    " << elementFunction(isInput, index) << "(" << std::endl
-     << "      " << weights << ',' << std::endl
-     << "      " << output << std::endl
-     << "    );" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addCopySource(std::stringstream &ss, const bool isInput, const int index) {
-  const QFunctionField &qfField = args.getQfField(isInput, index);
-  const std::string     size    = std::to_string(qfField.size);
-
-  std::string input, output;
-  if (isInput) {
-    input += dofInputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]";
-    output += indexedVar("quadInput", index) + "[q + field * OCCA_Q]";
-  } else {
-    input  = indexedVar("quadOutput", index) + "[q + field * OCCA_Q]";
-    output = dofOutputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]";
-  }
-
-  ss << "    // Copying source directly (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int field = 0; field < " << size << "; ++field) {" << std::endl
-     << "      for (int q = 0; q < OCCA_Q; ++q) {" << std::endl
-     << "        " << output << " = " << input << ";" << std::endl
-     << "      }" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addQFunctionApplicationSource(std::stringstream &ss) {
-  ss << "    // Apply qFunction" << std::endl
-     << "    " << qfunction->qFunctionName << "(ctx, OCCA_Q, quadInputs, quadOutputs);" << std::endl
-     << std::endl;
-}
-
-//  ---[ Variables ]-----------------
-std::string CpuOperator::elementFunction(const bool isInput, const int index) {
-  return fullFieldFunctionName(isInput, args.getOpField(isInput, index), args.getQfField(isInput, index));
-}
-
-std::string CpuOperator::fieldFunctionName(const QFunctionField &qfField) {
-  switch (qfField.evalMode) {
-    case CEED_EVAL_INTERP:
-      return "interp";
-    case CEED_EVAL_GRAD:
-      return "grad";
-    case CEED_EVAL_WEIGHT:
-      return "weight";
-    default:
-      return "none";
-  }
-}
-
-std::string CpuOperator::fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField) {
-  // Output:
-  //   - tensor_1d_interpElement_Q2_P2
-  //   - simplex_1d_interpElementTranspose_Q2_P2
-
-  const bool        usingTensorBasis = opField.usingTensorBasis();
-  std::stringstream ss;
-  int               dim, Q, P;
-
-  if (usingTensorBasis) {
-    TensorBasis &basis = *((TensorBasis *)opField.basis);
-    dim                = basis.dim;
-    Q                  = basis.Q1D;
-    P                  = basis.P1D;
-    ss << "tensor_";
-  } else {
-    SimplexBasis &basis = *((SimplexBasis *)opField.basis);
-    dim                 = basis.dim;
-    Q                   = basis.Q;
-    P                   = basis.P;
-    ss << "simplex_";
-  }
-
-  ss << dim << "d_" << fieldFunctionName(qfField) << "Element";
-
-  if (!isInput) {
-    ss << "Transpose";
-  }
-
-  ss << "_Q" << Q << "_P" << P;
-
-  return ss.str();
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-cpu-operator.hpp b/backends/occa/ceed-occa-cpu-operator.hpp
deleted file mode 100644
index 62c336562b..0000000000
--- a/backends/occa/ceed-occa-cpu-operator.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_CPU_OPERATOR_HEADER
-#define CEED_OCCA_CPU_OPERATOR_HEADER
-
-#include <sstream>
-#include <vector>
-
-#include "ceed-occa-operator.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-class Basis;
-class SimplexBasis;
-class TensorBasis;
-
-class CpuOperator : public Operator {
- private:
-  typedef std::vector<Vector *> VectorVector;
-  typedef std::vector<Basis *>  BasisVector;
-
-  VectorVector dofInputs, dofOutputs;
-
- public:
-  CpuOperator();
-
-  ~CpuOperator();
-
-  // Setup helper vectors
-  void setupVectors();
-
-  void setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors);
-
-  void freeVectors();
-
-  // Restriction operators
-  void setupInputs(Vector *in);
-
-  void setupOutputs(Vector *out);
-
-  void applyQFunction();
-
-  // Push arguments for a given field
-  void pushKernelArgs(Vector *vec, const bool isInput, const int index);
-
-  void pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis);
-
-  void pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis);
-
-  // Set props for a given field
-  ::occa::properties getKernelProps();
-
-  void applyAdd(Vector *in, Vector *out);
-
-  ::occa::kernel buildApplyAddKernel();
-
-  //---[ Kernel Generation ]------------------
-  void addBasisFunctionSource(std::stringstream &ss);
-
-  void addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis);
-
-  void addKernelSource(std::stringstream &ss);
-
-  void addKernelArgsSource(std::stringstream &ss);
-
-  void addKernelArgSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                const QFunctionField &qfField, std::stringstream &dimAttribute);
-
-  void addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                 const QFunctionField &qfField, std::stringstream &dimAttribute);
-
-  void addQuadArraySource(std::stringstream &ss);
-
-  void addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name);
-
-  void addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name);
-
-  void addInputSetupSource(std::stringstream &ss);
-
-  void addOutputSetupSource(std::stringstream &ss);
-
-  void addBasisApplySource(std::stringstream &ss, const bool isInput, const int count);
-
-  void addInterpSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addGradTensorSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addWeightSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addCopySource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addQFunctionApplicationSource(std::stringstream &ss);
-
-  //  ---[ Variables ]---------------
-  inline std::string xputName(const bool isInput) { return isInput ? "input" : "output"; }
-
-  inline std::string indexedVar(const std::string &name, const int index) { return name + std::to_string(index); }
-
-  inline std::string indexedVar(const std::string &name, const bool isInput, const int index) {
-    return (isInput ? "input" : "output") + std::to_string(index) + "_" + name;
-  }
-
-  inline std::string dofInputVar(const int index) { return indexedVar("dofInput", index); }
-
-  inline std::string dofOutputVar(const int index) { return indexedVar("dofOutput", index); }
-
-  inline std::string interpVar(const bool isInput, const int index) { return indexedVar("B", isInput, index); }
-
-  inline std::string gradVar(const bool isInput, const int index) { return indexedVar("Bx", isInput, index); }
-
-  inline std::string qWeightVar(const bool isInput, const int index) { return indexedVar("qWeights", isInput, index); }
-
-  std::string elementFunction(const bool isInput, const int index);
-
-  std::string fieldFunctionName(const QFunctionField &qfField);
-
-  std::string fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-elem-restriction.cpp b/backends/occa/ceed-occa-elem-restriction.cpp
deleted file mode 100644
index 2fa4b57c9e..0000000000
--- a/backends/occa/ceed-occa-elem-restriction.cpp
+++ /dev/null
@@ -1,372 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "./ceed-occa-elem-restriction.hpp"
-
-#include <cstring>
-#include <map>
-
-#include "./ceed-occa-kernels.hpp"
-#include "./ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-ElemRestriction::ElemRestriction()
-    : ceedElementCount(0),
-      ceedElementSize(0),
-      ceedComponentCount(0),
-      ceedLVectorSize(0),
-      ceedNodeStride(0),
-      ceedComponentStride(0),
-      ceedElementStride(0),
-      ceedUnstridedComponentStride(0),
-      freeHostIndices(true),
-      hostIndices(NULL),
-      freeIndices(true) {}
-
-ElemRestriction::~ElemRestriction() {
-  if (freeHostIndices) {
-    CeedFree(&hostIndices);
-  }
-  if (freeIndices) {
-    indices.free();
-  }
-}
-
-void ElemRestriction::setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput) {
-  if (memType == CEED_MEM_HOST) {
-    setupFromHostMemory(copyMode, indicesInput);
-  } else {
-    setupFromDeviceMemory(copyMode, indicesInput);
-  }
-
-  setupTransposeIndices();
-}
-
-void ElemRestriction::setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h) {
-  const CeedInt entries = ceedElementCount * ceedElementSize;
-
-  freeHostIndices = (copyMode == CEED_OWN_POINTER || copyMode == CEED_COPY_VALUES);
-
-  if (copyMode != CEED_COPY_VALUES) {
-    hostIndices = const_cast<CeedInt *>(indices_h);
-  } else {
-    const size_t bytes = entries * sizeof(CeedInt);
-    hostIndices        = (CeedInt *)::malloc(bytes);
-    std::memcpy(hostIndices, indices_h, bytes);
-  }
-
-  if (hostIndices) {
-    indices = getDevice().malloc<CeedInt>(entries, hostIndices);
-  }
-}
-
-void ElemRestriction::setupFromDeviceMemory(CeedCopyMode copyMode, const CeedInt *indices_d) {
-  ::occa::memory deviceIndices = arrayToMemory(indices_d);
-
-  freeIndices = (copyMode == CEED_OWN_POINTER);
-
-  if (copyMode == CEED_COPY_VALUES) {
-    indices = deviceIndices.clone();
-  } else {
-    indices = deviceIndices;
-  }
-}
-
-bool ElemRestriction::usesIndices() { return indices.isInitialized(); }
-
-void ElemRestriction::setupTransposeIndices() {
-  if (!usesIndices() || transposeQuadIndices.isInitialized()) {
-    return;
-  }
-
-  const CeedInt elementEntryCount = ceedElementCount * ceedElementSize;
-
-  bool *indexIsUsed = new bool[ceedLVectorSize];
-  std::memset(indexIsUsed, 0, ceedLVectorSize * sizeof(bool));
-
-  for (CeedInt i = 0; i < elementEntryCount; ++i) {
-    indexIsUsed[hostIndices[i]] = true;
-  }
-
-  CeedInt nodeCount = 0;
-  for (CeedInt i = 0; i < ceedLVectorSize; ++i) {
-    nodeCount += indexIsUsed[i];
-  }
-
-  const CeedInt dofOffsetCount         = nodeCount + 1;
-  CeedInt      *quadIndexToDofOffset   = new CeedInt[ceedLVectorSize];
-  CeedInt      *transposeQuadIndices_h = new CeedInt[nodeCount];
-  CeedInt      *transposeDofOffsets_h  = new CeedInt[dofOffsetCount];
-  CeedInt      *transposeDofIndices_h  = new CeedInt[elementEntryCount];
-
-  std::memset(transposeDofOffsets_h, 0, dofOffsetCount * sizeof(CeedInt));
-
-  // Compute ids
-  CeedInt offsetId = 0;
-  for (CeedInt i = 0; i < ceedLVectorSize; ++i) {
-    if (indexIsUsed[i]) {
-      transposeQuadIndices_h[offsetId] = i;
-      quadIndexToDofOffset[i]          = offsetId++;
-    }
-  }
-
-  // Count how many times a specific quad node is used
-  for (CeedInt i = 0; i < elementEntryCount; ++i) {
-    ++transposeDofOffsets_h[quadIndexToDofOffset[hostIndices[i]] + 1];
-  }
-
-  // Aggregate to find true offsets
-  for (CeedInt i = 1; i < dofOffsetCount; ++i) {
-    transposeDofOffsets_h[i] += transposeDofOffsets_h[i - 1];
-  }
-
-  // Compute dof indices
-  for (CeedInt i = 0; i < elementEntryCount; ++i) {
-    const CeedInt quadIndex         = hostIndices[i];
-    const CeedInt dofIndex          = transposeDofOffsets_h[quadIndexToDofOffset[quadIndex]]++;
-    transposeDofIndices_h[dofIndex] = i;
-  }
-
-  // Reset offsets
-  for (int i = dofOffsetCount - 1; i > 0; --i) {
-    transposeDofOffsets_h[i] = transposeDofOffsets_h[i - 1];
-  }
-  transposeDofOffsets_h[0] = 0;
-
-  // Copy to device
-  ::occa::device device = getDevice();
-
-  transposeQuadIndices = device.malloc<CeedInt>(nodeCount, transposeQuadIndices_h);
-  transposeDofOffsets  = device.malloc<CeedInt>(dofOffsetCount, transposeDofOffsets_h);
-  transposeDofIndices  = device.malloc<CeedInt>(elementEntryCount, transposeDofIndices_h);
-
-  // Clean up temporary arrays
-  delete[] indexIsUsed;
-  delete[] quadIndexToDofOffset;
-  delete[] transposeQuadIndices_h;
-  delete[] transposeDofOffsets_h;
-  delete[] transposeDofIndices_h;
-}
-
-void ElemRestriction::setKernelProperties() {
-  kernelProperties["defines/CeedInt"]                    = ::occa::dtype::get<CeedInt>().name();
-  kernelProperties["defines/CeedScalar"]                 = ::occa::dtype::get<CeedScalar>().name();
-  kernelProperties["defines/COMPONENT_COUNT"]            = ceedComponentCount;
-  kernelProperties["defines/ELEMENT_SIZE"]               = ceedElementSize;
-  kernelProperties["defines/TILE_SIZE"]                  = 64;
-  kernelProperties["defines/USES_INDICES"]               = usesIndices();
-  kernelProperties["defines/USER_STRIDES"]               = StrideType::USER_STRIDES;
-  kernelProperties["defines/NOT_STRIDED"]                = StrideType::NOT_STRIDED;
-  kernelProperties["defines/BACKEND_STRIDES"]            = StrideType::BACKEND_STRIDES;
-  kernelProperties["defines/STRIDE_TYPE"]                = ceedStrideType;
-  kernelProperties["defines/NODE_COUNT"]                 = transposeQuadIndices.length();
-  kernelProperties["defines/NODE_STRIDE"]                = ceedNodeStride;
-  kernelProperties["defines/COMPONENT_STRIDE"]           = ceedComponentStride;
-  kernelProperties["defines/ELEMENT_STRIDE"]             = ceedElementStride;
-  kernelProperties["defines/UNSTRIDED_COMPONENT_STRIDE"] = ceedUnstridedComponentStride;
-}
-
-ElemRestriction *ElemRestriction::getElemRestriction(CeedElemRestriction r, const bool assertValid) {
-  if (!r || r == CEED_ELEMRESTRICTION_NONE) {
-    return NULL;
-  }
-
-  int              ierr;
-  ElemRestriction *elemRestriction = NULL;
-
-  ierr = CeedElemRestrictionGetData(r, (void **)&elemRestriction);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return elemRestriction;
-}
-
-ElemRestriction *ElemRestriction::from(CeedElemRestriction r) {
-  ElemRestriction *elemRestriction = getElemRestriction(r);
-  if (!elemRestriction) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedElemRestrictionGetCeed(r, &elemRestriction->ceed));
-
-  return elemRestriction->setupFrom(r);
-}
-
-ElemRestriction *ElemRestriction::from(CeedOperatorField operatorField) {
-  CeedElemRestriction ceedElemRestriction;
-  CeedCallOcca(CeedOperatorFieldGetElemRestriction(operatorField, &ceedElemRestriction));
-  ElemRestriction *elemRestriction = from(ceedElemRestriction);
-  CeedCallOcca(CeedElemRestrictionDestroy(&ceedElemRestriction));
-  return elemRestriction;
-}
-
-ElemRestriction *ElemRestriction::setupFrom(CeedElemRestriction r) {
-  CeedCallOcca(CeedElemRestrictionGetNumElements(r, &ceedElementCount));
-
-  CeedCallOcca(CeedElemRestrictionGetElementSize(r, &ceedElementSize));
-
-  CeedCallOcca(CeedElemRestrictionGetNumComponents(r, &ceedComponentCount));
-
-  CeedCallOcca(CeedElemRestrictionGetLVectorSize(r, &ceedLVectorSize));
-
-  // Find what type of striding the restriction uses
-  bool isStrided         = false;
-  bool hasBackendStrides = false;
-
-  CeedCallOcca(CeedElemRestrictionIsStrided(r, &isStrided));
-
-  if (isStrided) {
-    CeedCallOcca(CeedElemRestrictionHasBackendStrides(r, &hasBackendStrides));
-  }
-
-  if (isStrided) {
-    if (hasBackendStrides) {
-      ceedStrideType = BACKEND_STRIDES;
-    } else {
-      ceedStrideType = USER_STRIDES;
-    }
-  } else {
-    ceedStrideType = NOT_STRIDED;
-  }
-
-  // Default strides
-  ceedNodeStride               = 1;
-  ceedComponentStride          = ceedElementSize;
-  ceedElementStride            = ceedElementSize * ceedComponentCount;
-  ceedUnstridedComponentStride = 1;
-
-  if (ceedStrideType == USER_STRIDES) {
-    CeedInt strides[3];
-
-    CeedCallOcca(CeedElemRestrictionGetStrides(r, strides));
-
-    ceedNodeStride      = strides[0];
-    ceedComponentStride = strides[1];
-    ceedElementStride   = strides[2];
-
-  } else if (ceedStrideType == NOT_STRIDED) {
-    CeedCallOcca(CeedElemRestrictionGetCompStride(r, &ceedUnstridedComponentStride));
-  }
-
-  return this;
-}
-
-int ElemRestriction::apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v) {
-  const bool rIsTransposed = (rTransposeMode != CEED_NOTRANSPOSE);
-
-  // Todo: refactor
-  if (rIsTransposed) {
-    if (!restrictionTransposeKernel.isInitialized()) {
-      setKernelProperties();
-      restrictionTransposeKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestrictionTranspose", kernelProperties);
-    }
-    restrictionTransposeKernel(ceedElementCount, transposeQuadIndices, transposeDofOffsets, transposeDofIndices, u.getConstKernelArg(),
-                               v.getKernelArg());
-  } else {
-    if (!restrictionKernel.isInitialized()) {
-      setKernelProperties();
-      restrictionKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestriction", kernelProperties);
-    }
-    restrictionKernel(ceedElementCount, indices, u.getConstKernelArg(), v.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int ElemRestriction::getOffsets(CeedMemType memType, const CeedInt **offsets) {
-  switch (memType) {
-    case CEED_MEM_HOST: {
-      *offsets = hostIndices;
-      return CEED_ERROR_SUCCESS;
-    }
-    case CEED_MEM_DEVICE: {
-      *offsets = memoryToArray<CeedInt>(indices);
-      return CEED_ERROR_SUCCESS;
-    }
-  }
-  return ceedError("Unsupported CeedMemType passed to ElemRestriction::getOffsets");
-}
-
-//---[ Ceed Callbacks ]-----------
-int ElemRestriction::registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "ElemRestriction", r, fname, f);
-}
-
-int ElemRestriction::ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, const bool *orientsInput,
-                                const CeedInt8 *curlOrientsInput, CeedElemRestriction r) {
-  Ceed ceed;
-  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
-
-  if ((memType != CEED_MEM_DEVICE) && (memType != CEED_MEM_HOST)) {
-    return staticCeedError("Only HOST and DEVICE CeedMemType supported");
-  }
-
-  CeedRestrictionType rstr_type;
-  CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type));
-  if ((rstr_type == CEED_RESTRICTION_ORIENTED) || (rstr_type == CEED_RESTRICTION_CURL_ORIENTED)) {
-    return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented");
-  }
-
-  ElemRestriction *elemRestriction = new ElemRestriction();
-  CeedCallBackend(CeedElemRestrictionSetData(r, elemRestriction));
-
-  // Setup Ceed objects before setting up memory
-  elemRestriction = ElemRestriction::from(r);
-  elemRestriction->setup(memType, copyMode, indicesInput);
-
-  CeedInt defaultLayout[3] = {1, elemRestriction->ceedElementSize, elemRestriction->ceedElementSize * elemRestriction->ceedComponentCount};
-  CeedCallBackend(CeedElemRestrictionSetELayout(r, defaultLayout));
-
-  CeedOccaRegisterFunction(r, "Apply", ElemRestriction::ceedApply);
-  CeedOccaRegisterFunction(r, "ApplyUnsigned", ElemRestriction::ceedApply);
-  CeedOccaRegisterFunction(r, "ApplyUnoriented", ElemRestriction::ceedApply);
-  CeedOccaRegisterFunction(r, "ApplyBlock", ElemRestriction::ceedApplyBlock);
-  CeedOccaRegisterFunction(r, "GetOffsets", ElemRestriction::ceedGetOffsets);
-  CeedOccaRegisterFunction(r, "Destroy", ElemRestriction::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int ElemRestriction::ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) {
-  ElemRestriction *elemRestriction = ElemRestriction::from(r);
-  Vector          *uVector         = Vector::from(u);
-  Vector          *vVector         = Vector::from(v);
-
-  if (!elemRestriction) {
-    return staticCeedError("Incorrect CeedElemRestriction argument: r");
-  }
-  if (!uVector) {
-    return elemRestriction->ceedError("Incorrect CeedVector argument: u");
-  }
-  if (!vVector) {
-    return elemRestriction->ceedError("Incorrect CeedVector argument: v");
-  }
-
-  return elemRestriction->apply(tmode, *uVector, *vVector);
-}
-
-int ElemRestriction::ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) {
-  return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionApplyBlock");
-}
-
-int ElemRestriction::ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const CeedInt **offsets) {
-  ElemRestriction *elemRestriction = ElemRestriction::from(r);
-
-  if (!elemRestriction) {
-    return staticCeedError("Incorrect CeedElemRestriction argument: r");
-  }
-
-  return elemRestriction->getOffsets(memType, offsets);
-}
-
-int ElemRestriction::ceedDestroy(CeedElemRestriction r) {
-  delete getElemRestriction(r, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-elem-restriction.hpp b/backends/occa/ceed-occa-elem-restriction.hpp
deleted file mode 100644
index 6c6206b82c..0000000000
--- a/backends/occa/ceed-occa-elem-restriction.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_ELEMRESTRICTION_HEADER
-#define CEED_OCCA_ELEMRESTRICTION_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-enum StrideType {
-  BACKEND_STRIDES = 0,
-  USER_STRIDES    = 1,
-  NOT_STRIDED     = 2,
-};
-
-class ElemRestriction : public CeedObject {
- public:
-  // Ceed object information
-  CeedInt    ceedElementCount;
-  CeedInt    ceedElementSize;
-  CeedInt    ceedComponentCount;
-  CeedSize   ceedLVectorSize;
-  StrideType ceedStrideType;
-  CeedInt    ceedNodeStride;
-  CeedInt    ceedComponentStride;
-  CeedInt    ceedElementStride;
-  CeedInt    ceedUnstridedComponentStride;
-
-  // Passed resources
-  bool     freeHostIndices;
-  CeedInt *hostIndices;
-
-  // Owned resources
-  bool           freeIndices;
-  ::occa::memory indices;
-
-  ::occa::memory transposeQuadIndices;
-  ::occa::memory transposeDofOffsets;
-  ::occa::memory transposeDofIndices;
-
-  ::occa::json   kernelProperties;
-  ::occa::kernel restrictionKernel;
-  ::occa::kernel restrictionTransposeKernel;
-
-  ElemRestriction();
-
-  ~ElemRestriction();
-
-  void setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput);
-
-  void setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h);
-
-  void setupFromDeviceMemory(CeedCopyMode copyMode, const CeedInt *indices_d);
-
-  bool usesIndices();
-
-  void setupTransposeIndices();
-
-  void setKernelProperties();
-
-  static ElemRestriction *getElemRestriction(CeedElemRestriction r, const bool assertValid = true);
-
-  static ElemRestriction *from(CeedElemRestriction r);
-  static ElemRestriction *from(CeedOperatorField operatorField);
-  ElemRestriction        *setupFrom(CeedElemRestriction r);
-
-  int apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v);
-
-  int getOffsets(CeedMemType memType, const CeedInt **offsets);
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, const bool *orientsInput,
-                        const CeedInt8 *curlOrientsInput, CeedElemRestriction r);
-
-  static int ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request);
-
-  static int ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const CeedInt **offsets);
-
-  static int ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request);
-
-  static int ceedDestroy(CeedElemRestriction r);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-gpu-operator.cpp b/backends/occa/ceed-occa-gpu-operator.cpp
deleted file mode 100644
index f35a52bb97..0000000000
--- a/backends/occa/ceed-occa-gpu-operator.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-gpu-operator.hpp"
-
-#include "ceed-occa-qfunction.hpp"
-
-namespace ceed {
-namespace occa {
-GpuOperator::GpuOperator() {}
-
-GpuOperator::~GpuOperator() {}
-
-::occa::kernel GpuOperator::buildApplyAddKernel() { return ::occa::kernel(); }
-
-void GpuOperator::applyAdd(Vector *in, Vector *out) {
-  // TODO: Implement
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-gpu-operator.hpp b/backends/occa/ceed-occa-gpu-operator.hpp
deleted file mode 100644
index 8b7651b396..0000000000
--- a/backends/occa/ceed-occa-gpu-operator.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_GPU_OPERATOR_HEADER
-#define CEED_OCCA_GPU_OPERATOR_HEADER
-
-#include <vector>
-
-#include "ceed-occa-operator.hpp"
-
-namespace ceed {
-namespace occa {
-class GpuOperator : public Operator {
- public:
-  GpuOperator();
-
-  ~GpuOperator();
-
-  ::occa::kernel buildApplyAddKernel();
-
-  void applyAdd(Vector *in, Vector *out);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-kernels.hpp b/backends/occa/ceed-occa-kernels.hpp
deleted file mode 100644
index bfa77f52a5..0000000000
--- a/backends/occa/ceed-occa-kernels.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_HEADER
-#define CEED_OCCA_KERNELS_HEADER
-
-#include "./kernels/elem-restriction.hpp"
-#include "./kernels/set-value.hpp"
-#include "./kernels/simplex-basis.hpp"
-#include "./kernels/tensor-basis.hpp"
-
-#endif
diff --git a/backends/occa/ceed-occa-operator-args.cpp b/backends/occa/ceed-occa-operator-args.cpp
deleted file mode 100644
index caf02788fd..0000000000
--- a/backends/occa/ceed-occa-operator-args.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-operator-args.hpp"
-
-namespace ceed {
-namespace occa {
-OperatorArgs::OperatorArgs() : QFunctionArgs() {}
-
-OperatorArgs::OperatorArgs(CeedOperator op) : QFunctionArgs() { setupArgs(op); }
-
-void OperatorArgs::setupArgs(CeedOperator op) {
-  CeedQFunction      qf;
-  CeedOperatorField *ceedInputFields, *ceedOutputFields;
-
-  CeedCallOccaValid(_isValid, CeedOperatorGetQFunction(op, &qf));
-  setupQFunctionArgs(qf);
-
-  if (!_isValid) {
-    return;
-  }
-
-  CeedCallOccaValid(_isValid, CeedOperatorGetFields(op, NULL, &ceedInputFields, NULL, &ceedOutputFields));
-
-  for (int i = 0; i < _inputCount; ++i) {
-    OperatorField field = OperatorField(ceedInputFields[i]);
-    opInputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-
-  for (int i = 0; i < _outputCount; ++i) {
-    OperatorField field = OperatorField(ceedOutputFields[i]);
-    opOutputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-}
-
-const OperatorField &OperatorArgs::getOpField(const bool isInput, const int index) const { return isInput ? opInputs[index] : opOutputs[index]; }
-
-const OperatorField &OperatorArgs::getOpInput(const int index) const { return opInputs[index]; }
-
-const OperatorField &OperatorArgs::getOpOutput(const int index) const { return opOutputs[index]; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-operator-args.hpp b/backends/occa/ceed-occa-operator-args.hpp
deleted file mode 100644
index 6ea4d96687..0000000000
--- a/backends/occa/ceed-occa-operator-args.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_OPERATORARGS_HEADER
-#define CEED_OCCA_OPERATORARGS_HEADER
-
-#include <vector>
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-operator-field.hpp"
-#include "ceed-occa-qfunction-args.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::vector<OperatorField> OperatorFieldVector;
-
-class OperatorArgs : public QFunctionArgs {
- public:
-  OperatorFieldVector opInputs;
-  OperatorFieldVector opOutputs;
-
-  OperatorArgs();
-  OperatorArgs(CeedOperator op);
-
-  void setupArgs(CeedOperator op);
-
-  const OperatorField &getOpField(const bool isInput, const int index) const;
-
-  const OperatorField &getOpInput(const int index) const;
-
-  const OperatorField &getOpOutput(const int index) const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-operator-field.cpp b/backends/occa/ceed-occa-operator-field.cpp
deleted file mode 100644
index db13070149..0000000000
--- a/backends/occa/ceed-occa-operator-field.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-operator-field.hpp"
-
-#include "ceed-occa-basis.hpp"
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-OperatorField::OperatorField(CeedOperatorField opField) : _isValid(false), _usesActiveVector(false), vec(NULL), basis(NULL), elemRestriction(NULL) {
-  CeedBasis           ceedBasis;
-  CeedVector          ceedVector;
-  CeedElemRestriction ceedElemRestriction;
-
-  CeedCallOccaValid(_isValid, CeedOperatorFieldGetBasis(opField, &ceedBasis));
-  CeedCallOccaValid(_isValid, CeedOperatorFieldGetVector(opField, &ceedVector));
-  CeedCallOccaValid(_isValid, CeedOperatorFieldGetElemRestriction(opField, &ceedElemRestriction));
-
-  _isValid          = true;
-  _usesActiveVector = ceedVector == CEED_VECTOR_ACTIVE;
-
-  vec             = Vector::from(ceedVector);
-  basis           = Basis::from(ceedBasis);
-  elemRestriction = ElemRestriction::from(ceedElemRestriction);
-
-  CeedCallOccaValid(_isValid, CeedBasisDestroy(&ceedBasis));
-  CeedCallOccaValid(_isValid, CeedVectorDestroy(&ceedVector));
-  CeedCallOccaValid(_isValid, CeedElemRestrictionDestroy(&ceedElemRestriction));
-}
-
-bool OperatorField::isValid() const { return _isValid; }
-
-//---[ Vector Info ]----------------
-bool OperatorField::usesActiveVector() const { return _usesActiveVector; }
-//==================================
-
-//---[ Basis Info ]-----------------
-bool OperatorField::hasBasis() const { return basis; }
-
-int OperatorField::usingTensorBasis() const { return basis->isTensorBasis(); }
-
-int OperatorField::getComponentCount() const { return (basis ? basis->ceedComponentCount : 1); }
-
-int OperatorField::getP() const { return (basis ? basis->P : 0); }
-
-int OperatorField::getQ() const { return (basis ? basis->Q : 0); }
-
-int OperatorField::getDim() const { return (basis ? basis->dim : 1); }
-//==================================
-
-//---[ ElemRestriction Info ]-------
-int OperatorField::getElementCount() const { return (elemRestriction ? elemRestriction->ceedElementCount : 1); }
-
-int OperatorField::getElementSize() const { return (elemRestriction ? elemRestriction->ceedElementSize : 1); }
-//==================================
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-operator-field.hpp b/backends/occa/ceed-occa-operator-field.hpp
deleted file mode 100644
index 866364fbb4..0000000000
--- a/backends/occa/ceed-occa-operator-field.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_OPERATORFIELD_HEADER
-#define CEED_OCCA_OPERATORFIELD_HEADER
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-class Basis;
-class ElemRestriction;
-class Vector;
-
-class OperatorField {
- private:
-  bool _isValid;
-  bool _usesActiveVector;
-
- public:
-  Vector          *vec;
-  Basis           *basis;
-  ElemRestriction *elemRestriction;
-
-  OperatorField(CeedOperatorField opField);
-
-  bool isValid() const;
-
-  //---[ Vector Info ]--------------
-  bool usesActiveVector() const;
-  //================================
-
-  //---[ Basis Info ]---------------
-  bool hasBasis() const;
-  int  usingTensorBasis() const;
-
-  int getComponentCount() const;
-  int getP() const;
-  int getQ() const;
-  int getDim() const;
-  //================================
-
-  //---[ ElemRestriction Info ]-----
-  int getElementCount() const;
-  int getElementSize() const;
-  //================================
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-operator.cpp b/backends/occa/ceed-occa-operator.cpp
deleted file mode 100644
index 9111c1f8a1..0000000000
--- a/backends/occa/ceed-occa-operator.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-operator.hpp"
-
-#include "ceed-occa-basis.hpp"
-#include "ceed-occa-cpu-operator.hpp"
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-gpu-operator.hpp"
-#include "ceed-occa-qfunction.hpp"
-
-namespace ceed {
-namespace occa {
-Operator::Operator() : ceedQ(0), ceedElementCount(0), qfunction(NULL), needsInitialSetup(true) {}
-
-Operator::~Operator() {}
-
-Operator *Operator::getOperator(CeedOperator op, const bool assertValid) {
-  if (!op) {
-    return NULL;
-  }
-
-  int       ierr;
-  Operator *operator_ = NULL;
-
-  ierr = CeedOperatorGetData(op, (void **)&operator_);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return operator_;
-}
-
-Operator *Operator::from(CeedOperator op) {
-  Operator *operator_ = getOperator(op);
-  if (!operator_) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedOperatorGetCeed(op, &operator_->ceed));
-
-  operator_->qfunction = QFunction::from(op);
-  if (!operator_->qfunction) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedOperatorGetNumQuadraturePoints(op, &operator_->ceedQ));
-  CeedCallOcca(CeedOperatorGetNumElements(op, &operator_->ceedElementCount));
-
-  operator_->args.setupArgs(op);
-  if (!operator_->args.isValid()) {
-    return NULL;
-  }
-
-  return operator_;
-}
-
-bool Operator::isApplyingIdentityFunction() { return qfunction->ceedIsIdentity; }
-
-int Operator::applyAdd(Vector *in, Vector *out, CeedRequest *request) {
-  // TODO: Cache kernel objects rather than relying on OCCA kernel caching
-  applyAddKernel = buildApplyAddKernel();
-
-  if (needsInitialSetup) {
-    initialSetup();
-    needsInitialSetup = false;
-  }
-
-  applyAdd(in, out);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Virtual Methods ]------------
-void Operator::initialSetup() {}
-
-//---[ Ceed Callbacks ]-------------
-int Operator::registerCeedFunction(Ceed ceed, CeedOperator op, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Operator", op, fname, f);
-}
-
-int Operator::ceedCreate(CeedOperator op) {
-  Ceed ceed;
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-
-#if 1
-  Operator *operator_ = new CpuOperator();
-#else
-  // TODO: Add GPU specific operator
-  Operator *operator_ = (Context::from(ceed)->usingCpuDevice() ? ((Operator *)new CpuOperator()) : ((Operator *)new GpuOperator()));
-#endif
-
-  CeedCallBackend(CeedOperatorSetData(op, operator_));
-
-  CeedOccaRegisterFunction(op, "LinearAssembleQFunction", Operator::ceedLinearAssembleQFunction);
-  CeedOccaRegisterFunction(op, "LinearAssembleQFunctionUpdate", Operator::ceedLinearAssembleQFunction);
-  CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal);
-  CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal);
-  CeedOccaRegisterFunction(op, "CreateFDMElementInverse", Operator::ceedCreateFDMElementInverse);
-  CeedOccaRegisterFunction(op, "ApplyAdd", Operator::ceedApplyAdd);
-  CeedOccaRegisterFunction(op, "Destroy", Operator::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int Operator::ceedCreateComposite(CeedOperator op) {
-  Ceed ceed;
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-
-  CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal);
-  CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int Operator::ceedLinearAssembleQFunction(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunction"); }
-
-int Operator::ceedLinearAssembleQFunctionUpdate(CeedOperator op) {
-  return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunctionUpdate");
-}
-
-int Operator::ceedLinearAssembleAddDiagonal(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement LinearAssembleDiagonal"); }
-
-int Operator::ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op) {
-  return staticCeedError("(OCCA) Backend does not implement LinearAssemblePointBlockDiagonal");
-}
-
-int Operator::ceedCreateFDMElementInverse(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement CreateFDMElementInverse"); }
-
-int Operator::ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) {
-  Operator *operator_ = Operator::from(op);
-  Vector   *in        = Vector::from(invec);
-  Vector   *out       = Vector::from(outvec);
-
-  if (!operator_) {
-    return staticCeedError("Incorrect CeedOperator argument: op");
-  }
-
-  return operator_->applyAdd(in, out, request);
-}
-
-int Operator::ceedDestroy(CeedOperator op) {
-  delete getOperator(op, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-operator.hpp b/backends/occa/ceed-occa-operator.hpp
deleted file mode 100644
index 866050ef1b..0000000000
--- a/backends/occa/ceed-occa-operator.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_OPERATOR_HEADER
-#define CEED_OCCA_OPERATOR_HEADER
-
-#include <vector>
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-operator-args.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::vector<ceed::occa::Vector *> VectorVector_t;
-
-class QFunction;
-
-class Operator : public CeedObject {
- public:
-  // Ceed object information
-  CeedInt ceedQ;
-  CeedInt ceedElementCount;
-
-  // Owned resources
-  QFunction     *qfunction;
-  OperatorArgs   args;
-  ::occa::kernel applyAddKernel;
-  bool           needsInitialSetup;
-
-  // Reference to other memory
-  ::occa::memory qFunctionContextData;
-
-  Operator();
-  virtual ~Operator();
-
-  static Operator *getOperator(CeedOperator op, const bool assertValid = true);
-
-  static Operator *from(CeedOperator op);
-
-  bool isApplyingIdentityFunction();
-
-  int applyAdd(Vector *in, Vector *out, CeedRequest *request);
-
-  //---[ Virtual Methods ]----------
-  virtual ::occa::kernel buildApplyAddKernel() = 0;
-
-  virtual void initialSetup();
-
-  virtual void applyAdd(Vector *in, Vector *out) = 0;
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedOperator op, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedOperator op);
-  static int ceedCreateComposite(CeedOperator op);
-
-  static int ceedLinearAssembleQFunction(CeedOperator op);
-  static int ceedLinearAssembleQFunctionUpdate(CeedOperator op);
-  static int ceedLinearAssembleAddDiagonal(CeedOperator op);
-  static int ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op);
-  static int ceedCreateFDMElementInverse(CeedOperator op);
-
-  static int ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request);
-
-  static int ceedDestroy(CeedOperator op);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunction-args.cpp b/backends/occa/ceed-occa-qfunction-args.cpp
deleted file mode 100644
index cec008f8e6..0000000000
--- a/backends/occa/ceed-occa-qfunction-args.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunction-args.hpp"
-
-namespace ceed {
-namespace occa {
-QFunctionArgs::QFunctionArgs() : _isValid(false), _inputCount(0), _outputCount(0) {}
-
-QFunctionArgs::QFunctionArgs(CeedQFunction qf) : _isValid(false), _inputCount(0), _outputCount(0) { setupQFunctionArgs(qf); }
-
-void QFunctionArgs::setupQFunctionArgs(CeedQFunction qf) {
-  CeedQFunctionField *ceedInputFields, *ceedOutputFields;
-
-  CeedCallOccaValid(_isValid, CeedQFunctionGetCeed(qf, &ceed));
-
-  CeedCallOccaValid(_isValid, CeedQFunctionGetNumArgs(qf, &_inputCount, &_outputCount));
-
-  CeedCallOccaValid(_isValid, CeedQFunctionGetFields(qf, NULL, &ceedInputFields, NULL, &ceedOutputFields));
-
-  _isValid = true;
-
-  for (int i = 0; i < _inputCount; ++i) {
-    QFunctionField field = QFunctionField(ceedInputFields[i]);
-    qfInputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-
-  for (int i = 0; i < _outputCount; ++i) {
-    QFunctionField field = QFunctionField(ceedOutputFields[i]);
-    qfOutputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-}
-
-bool QFunctionArgs::isValid() const { return _isValid; }
-
-int QFunctionArgs::inputCount() const { return _inputCount; }
-
-int QFunctionArgs::outputCount() const { return _outputCount; }
-
-const QFunctionField &QFunctionArgs::getQfField(const bool isInput, const int index) const { return isInput ? qfInputs[index] : qfOutputs[index]; }
-
-const QFunctionField &QFunctionArgs::getQfInput(const int index) const { return qfInputs[index]; }
-
-const QFunctionField &QFunctionArgs::getQfOutput(const int index) const { return qfOutputs[index]; }
-
-CeedEvalMode QFunctionArgs::getEvalMode(const bool isInput, const int index) const {
-  return isInput ? qfInputs[index].evalMode : qfOutputs[index].evalMode;
-}
-
-CeedEvalMode QFunctionArgs::getInputEvalMode(const int index) const { return qfInputs[index].evalMode; }
-
-CeedEvalMode QFunctionArgs::getOutputEvalMode(const int index) const { return qfOutputs[index].evalMode; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunction-args.hpp b/backends/occa/ceed-occa-qfunction-args.hpp
deleted file mode 100644
index de0e1fd751..0000000000
--- a/backends/occa/ceed-occa-qfunction-args.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTIONARGS_HEADER
-#define CEED_OCCA_QFUNCTIONARGS_HEADER
-
-#include <vector>
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-qfunction-field.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::vector<QFunctionField> QFunctionFieldVector;
-
-class QFunctionArgs : public CeedObject {
- protected:
-  bool    _isValid;
-  CeedInt _inputCount;
-  CeedInt _outputCount;
-
- public:
-  QFunctionFieldVector qfInputs;
-  QFunctionFieldVector qfOutputs;
-
-  QFunctionArgs();
-  QFunctionArgs(CeedQFunction qf);
-
-  void setupQFunctionArgs(CeedQFunction qf);
-
-  bool isValid() const;
-
-  int inputCount() const;
-  int outputCount() const;
-
-  const QFunctionField &getQfField(const bool isInput, const int index) const;
-
-  const QFunctionField &getQfInput(const int index) const;
-
-  const QFunctionField &getQfOutput(const int index) const;
-
-  CeedEvalMode getEvalMode(const bool isInput, const int index) const;
-
-  CeedEvalMode getInputEvalMode(const int index) const;
-
-  CeedEvalMode getOutputEvalMode(const int index) const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunction-field.cpp b/backends/occa/ceed-occa-qfunction-field.cpp
deleted file mode 100644
index c947f94458..0000000000
--- a/backends/occa/ceed-occa-qfunction-field.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunction-field.hpp"
-
-namespace ceed {
-namespace occa {
-QFunctionField::QFunctionField(CeedQFunctionField qfField) : _isValid(false), size(0) {
-  CeedCallOccaValid(_isValid, CeedQFunctionFieldGetEvalMode(qfField, &evalMode));
-
-  CeedCallOccaValid(_isValid, CeedQFunctionFieldGetSize(qfField, &size));
-
-  _isValid = true;
-}
-
-bool QFunctionField::isValid() const { return _isValid; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunction-field.hpp b/backends/occa/ceed-occa-qfunction-field.hpp
deleted file mode 100644
index 00c91b1aac..0000000000
--- a/backends/occa/ceed-occa-qfunction-field.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTIONFIELD_HEADER
-#define CEED_OCCA_QFUNCTIONFIELD_HEADER
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-class QFunctionField {
- protected:
-  bool _isValid;
-
- public:
-  CeedEvalMode evalMode;
-  CeedInt      size;
-
-  QFunctionField(CeedQFunctionField qfField);
-
-  bool isValid() const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunction.cpp b/backends/occa/ceed-occa-qfunction.cpp
deleted file mode 100644
index 9b79aabb0a..0000000000
--- a/backends/occa/ceed-occa-qfunction.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunction.hpp"
-
-#include <sstream>
-#include <string>
-
-#include "ceed-occa-qfunctioncontext.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-QFunction::QFunction(const std::string &source, const std::string &function_name) : ceedIsIdentity(false) {
-  filename      = source;
-  qFunctionName = function_name;
-}
-
-QFunction *QFunction::getQFunction(CeedQFunction qf, const bool assertValid) {
-  if (!qf) {
-    return NULL;
-  }
-
-  QFunction *qFunction = NULL;
-
-  CeedCallOcca(CeedQFunctionGetData(qf, &qFunction));
-
-  return qFunction;
-}
-
-QFunction *QFunction::from(CeedQFunction qf) {
-  QFunction *qFunction = getQFunction(qf);
-  if (!qFunction) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedQFunctionGetCeed(qf, &qFunction->ceed));
-
-  CeedCallOcca(CeedQFunctionGetInnerContext(qf, &qFunction->qFunctionContext));
-
-  CeedCallOcca(CeedQFunctionIsIdentity(qf, &qFunction->ceedIsIdentity));
-
-  qFunction->args.setupQFunctionArgs(qf);
-  if (!qFunction->args.isValid()) {
-    return NULL;
-  }
-
-  return qFunction;
-}
-
-QFunction *QFunction::from(CeedOperator op) {
-  if (!op) {
-    return NULL;
-  }
-
-  CeedQFunction qf;
-
-  CeedCallOcca(CeedOperatorGetQFunction(op, &qf));
-
-  return QFunction::from(qf);
-}
-
-::occa::properties QFunction::getKernelProps(const CeedInt Q) {
-  ::occa::properties props;
-
-  // Types
-  props["defines/CeedInt"]    = ::occa::dtype::get<CeedInt>().name();
-  props["defines/CeedScalar"] = ::occa::dtype::get<CeedScalar>().name();
-
-  // CEED defines
-  props["defines/CeedPragmaSIMD"]     = "";
-  props["defines/CEED_Q_VLA"]         = "OCCA_Q";
-  props["defines/CEED_ERROR_SUCCESS"] = 0;
-
-  std::stringstream ss;
-  ss << "#define CEED_QFUNCTION(FUNC_NAME) \\" << std::endl
-     << "  inline int FUNC_NAME" << std::endl
-     << "#define CEED_QFUNCTION_HELPER \\" << std::endl
-     << "  inline" << std::endl
-     << std::endl
-     << "#include \"" << filename << "\"" << std::endl;
-
-  props["headers"].asArray() += ss.str();
-
-  return props;
-}
-
-int QFunction::buildKernel(const CeedInt Q) {
-  // TODO: Store a kernel per Q
-  if (!qFunctionKernel.isInitialized()) {
-    ::occa::properties props = getKernelProps(Q);
-
-    // Properties only used in the QFunction kernel source
-    props["defines/OCCA_Q"] = Q;
-
-    const std::string kernelName = "qf_" + qFunctionName;
-
-    qFunctionKernel = (getDevice().buildKernelFromString(getKernelSource(kernelName, Q), kernelName, props));
-  }
-
-  return CEED_ERROR_SUCCESS;
-}
-
-std::string QFunction::getKernelSource(const std::string &kernelName, const CeedInt Q) {
-  std::stringstream ss;
-
-  ss << "@kernel" << std::endl << "void " << kernelName << "(" << std::endl;
-
-  // qfunction arguments
-  for (int i = 0; i < args.inputCount(); ++i) {
-    ss << "  const CeedScalar *in" << i << ',' << std::endl;
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    ss << "  CeedScalar *out" << i << ',' << std::endl;
-  }
-  ss << "  void *ctx" << std::endl;
-  ss << ") {" << std::endl;
-
-  // Iterate over Q and call qfunction
-  ss << "  @tile(128, @outer, @inner)" << std::endl
-     << "  for (int q = 0; q < OCCA_Q; ++q) {" << std::endl
-     << "    const CeedScalar* in[" << std::max(1, args.inputCount()) << "];" << std::endl
-     << "    CeedScalar* out[" << std::max(1, args.outputCount()) << "];" << std::endl;
-
-  // Set and define in for the q point
-  for (int i = 0; i < args.inputCount(); ++i) {
-    const CeedInt     fieldSize = args.getQfInput(i).size;
-    const std::string qIn_i     = "qIn" + std::to_string(i);
-    const std::string in_i      = "in" + std::to_string(i);
-
-    ss << "    CeedScalar " << qIn_i << "[" << fieldSize << "];" << std::endl
-       << "    in[" << i << "] = " << qIn_i << ";"
-       << std::endl
-       // Copy q data
-       << "    for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl
-       << "      " << qIn_i << "[qi] = " << in_i << "[q + (OCCA_Q * qi)];" << std::endl
-       << "    }" << std::endl;
-  }
-
-  // Set out for the q point
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const CeedInt     fieldSize = args.getQfOutput(i).size;
-    const std::string qOut_i    = "qOut" + std::to_string(i);
-
-    ss << "    CeedScalar " << qOut_i << "[" << fieldSize << "];" << std::endl << "    out[" << i << "] = " << qOut_i << ";" << std::endl;
-  }
-
-  ss << "    " << qFunctionName << "(ctx, 1, in, out);" << std::endl;
-
-  // Copy out for the q point
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const CeedInt     fieldSize = args.getQfOutput(i).size;
-    const std::string qOut_i    = "qOut" + std::to_string(i);
-    const std::string out_i     = "out" + std::to_string(i);
-
-    ss << "    for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl
-       << "      " << out_i << "[q + (OCCA_Q * qi)] = " << qOut_i << "[qi];" << std::endl
-       << "    }" << std::endl;
-  }
-
-  ss << "  }" << std::endl << "}";
-
-  return ss.str();
-}
-
-int QFunction::apply(CeedInt Q, CeedVector *U, CeedVector *V) {
-  CeedCallBackend(buildKernel(Q));
-
-  std::vector<CeedScalar *> outputArgs;
-
-  qFunctionKernel.clearArgs();
-
-  for (CeedInt i = 0; i < args.inputCount(); i++) {
-    Vector *u = Vector::from(U[i]);
-    if (!u) {
-      return ceedError("Incorrect qFunction input field: U[" + std::to_string(i) + "]");
-    }
-    qFunctionKernel.pushArg(u->getConstKernelArg());
-  }
-
-  for (CeedInt i = 0; i < args.outputCount(); i++) {
-    Vector *v = Vector::from(V[i]);
-    if (!v) {
-      return ceedError("Incorrect qFunction output field: V[" + std::to_string(i) + "]");
-    }
-    qFunctionKernel.pushArg(v->getKernelArg());
-  }
-  if (qFunctionContext) {
-    QFunctionContext *ctx = QFunctionContext::from(qFunctionContext);
-    qFunctionKernel.pushArg(ctx->getKernelArg());
-  } else {
-    qFunctionKernel.pushArg(::occa::null);
-  }
-
-  qFunctionKernel.run();
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-----------
-int QFunction::registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "QFunction", qf, fname, f);
-}
-
-int QFunction::ceedCreate(CeedQFunction qf) {
-  Ceed ceed;
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
-  Context *context;
-  CeedCallBackend(CeedGetData(ceed, &context));
-  const char *source;
-  CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source));
-  const char *function_name;
-  CeedCallBackend(CeedQFunctionGetKernelName(qf, &function_name));
-
-  QFunction *qFunction = new QFunction(source, function_name);
-  CeedCallBackend(CeedQFunctionSetData(qf, qFunction));
-
-  CeedOccaRegisterFunction(qf, "Apply", QFunction::ceedApply);
-  CeedOccaRegisterFunction(qf, "Destroy", QFunction::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunction::ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) {
-  QFunction *qFunction = QFunction::from(qf);
-  if (qFunction) {
-    return qFunction->apply(Q, U, V);
-  }
-
-  return 1;
-}
-
-int QFunction::ceedDestroy(CeedQFunction qf) {
-  delete getQFunction(qf, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunction.hpp b/backends/occa/ceed-occa-qfunction.hpp
deleted file mode 100644
index e607941eb4..0000000000
--- a/backends/occa/ceed-occa-qfunction.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTION_HEADER
-#define CEED_OCCA_QFUNCTION_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-qfunction-args.hpp"
-
-namespace ceed {
-namespace occa {
-class QFunction : public CeedObject {
- public:
-  // Ceed object information
-  bool ceedIsIdentity;
-
-  // Owned resources
-  std::string          filename;
-  std::string          qFunctionName;
-  ::occa::kernel       qFunctionKernel;
-  CeedQFunctionContext qFunctionContext;
-  QFunctionArgs        args;
-
-  QFunction(const std::string &source, const std::string &function_name);
-
-  static QFunction *getQFunction(CeedQFunction qf, const bool assertValid = true);
-
-  static QFunction *from(CeedQFunction qf);
-  static QFunction *from(CeedOperator op);
-
-  ::occa::properties getKernelProps(const CeedInt Q);
-
-  int         buildKernel(const CeedInt Q);
-  std::string getKernelSource(const std::string &kernelName, const CeedInt Q);
-
-  int apply(CeedInt Q, CeedVector *U, CeedVector *V);
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedQFunction qf);
-
-  static int ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V);
-
-  static int ceedDestroy(CeedQFunction qf);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunctioncontext.cpp b/backends/occa/ceed-occa-qfunctioncontext.cpp
deleted file mode 100644
index a570be638c..0000000000
--- a/backends/occa/ceed-occa-qfunctioncontext.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunctioncontext.hpp"
-
-#include <cstring>
-
-namespace ceed {
-namespace occa {
-QFunctionContext::QFunctionContext() : ctxSize(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {}
-
-QFunctionContext::~QFunctionContext() {
-  memory.free();
-  freeHostCtxBuffer();
-}
-
-QFunctionContext *QFunctionContext::getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid) {
-  if (!ctx) {
-    return NULL;
-  }
-
-  int               ierr;
-  QFunctionContext *ctx_ = NULL;
-
-  ierr = CeedQFunctionContextGetBackendData(ctx, &ctx_);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return ctx_;
-}
-
-QFunctionContext *QFunctionContext::from(CeedQFunctionContext ctx) {
-  QFunctionContext *ctx_ = getQFunctionContext(ctx);
-  if (!ctx_) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedQFunctionContextGetContextSize(ctx, &ctx_->ctxSize));
-
-  if (ctx_ != NULL) {
-    CeedCallOcca(CeedQFunctionContextGetCeed(ctx, &ctx_->ceed));
-  }
-
-  return ctx_;
-}
-
-void QFunctionContext::resizeCtx(const size_t ctxSize_) { ctxSize = ctxSize_; }
-
-void QFunctionContext::resizeCtxMemory(const size_t ctxSize_) { resizeCtxMemory(getDevice(), ctxSize_); }
-
-void QFunctionContext::resizeCtxMemory(::occa::device device, const size_t ctxSize_) {
-  if (ctxSize_ != memory.size()) {
-    memory.free();
-    memory = device.malloc(ctxSize_);
-  }
-}
-
-void QFunctionContext::resizeHostCtxBuffer(const size_t ctxSize_) {
-  CeedFree(&hostBuffer);
-  CeedMallocArray(1, ctxSize, &hostBuffer);
-}
-
-void QFunctionContext::setCurrentCtxMemoryIfNeeded() {
-  if (!currentMemory.isInitialized()) {
-    resizeCtxMemory(ctxSize);
-    currentMemory = memory;
-  }
-}
-
-void QFunctionContext::setCurrentHostCtxBufferIfNeeded() {
-  if (!currentHostBuffer) {
-    resizeHostCtxBuffer(ctxSize);
-    currentHostBuffer = hostBuffer;
-  }
-}
-
-void QFunctionContext::freeHostCtxBuffer() {
-  if (hostBuffer) {
-    CeedFree(&hostBuffer);
-  }
-}
-
-int QFunctionContext::hasValidData(bool *has_valid_data) const {
-  (*has_valid_data) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized());
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunctionContext::hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const {
-  switch (mem_type) {
-    case CEED_MEM_HOST:
-      (*has_borrowed_data_of_type) = !!currentHostBuffer;
-      break;
-    case CEED_MEM_DEVICE:
-      (*has_borrowed_data_of_type) = currentMemory.isInitialized();
-      break;
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunctionContext::setData(CeedMemType mtype, CeedCopyMode cmode, void *data) {
-  switch (cmode) {
-    case CEED_COPY_VALUES:
-      return copyDataValues(mtype, data);
-    case CEED_OWN_POINTER:
-      return ownDataPointer(mtype, data);
-    case CEED_USE_POINTER:
-      return useDataPointer(mtype, data);
-  }
-  return ceedError("Invalid CeedCopyMode passed");
-}
-
-int QFunctionContext::copyDataValues(CeedMemType mtype, void *data) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostCtxBufferIfNeeded();
-      std::memcpy(currentHostBuffer, data, ctxSize);
-      syncState = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentCtxMemoryIfNeeded();
-      currentMemory.copyFrom(dataToMemory(data));
-      syncState = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::ownDataPointer(CeedMemType mtype, void *data) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostCtxBuffer();
-      hostBuffer = currentHostBuffer = data;
-      syncState                      = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      memory = currentMemory = dataToMemory(data);
-      syncState              = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::useDataPointer(CeedMemType mtype, void *data) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostCtxBuffer();
-      currentHostBuffer = data;
-      syncState         = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      currentMemory = dataToMemory(data);
-      syncState     = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::takeData(CeedMemType mtype, void *data) {
-  if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set");
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostCtxBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentCtxMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      syncState         = SyncState::host;
-      *(void **)data    = currentHostBuffer;
-      hostBuffer        = NULL;
-      currentHostBuffer = NULL;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentCtxMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostCtxBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      syncState      = SyncState::device;
-      *(void **)data = memoryToData(currentMemory);
-      memory         = ::occa::null;
-      currentMemory  = ::occa::null;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::getData(CeedMemType mtype, void *data) {
-  // The passed `data` might be modified before restoring
-  if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set");
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostCtxBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentCtxMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      syncState      = SyncState::host;
-      *(void **)data = currentHostBuffer;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentCtxMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostCtxBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      syncState      = SyncState::device;
-      *(void **)data = memoryToData(currentMemory);
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::restoreData() { return CEED_ERROR_SUCCESS; }
-
-::occa::memory QFunctionContext::getKernelArg() {
-  setCurrentCtxMemoryIfNeeded();
-  if (syncState == SyncState::host) {
-    setCurrentHostCtxBufferIfNeeded();
-    currentMemory.copyFrom(currentHostBuffer);
-  }
-  syncState = SyncState::device;
-  return currentMemory;
-}
-
-//---[ Ceed Callbacks ]-----------
-int QFunctionContext::registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "QFunctionContext", ctx, fname, f);
-}
-
-int QFunctionContext::ceedCreate(CeedQFunctionContext ctx) {
-  Ceed ceed;
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
-
-  CeedOccaRegisterFunction(ctx, "HasValidData", QFunctionContext::ceedHasValidData);
-  CeedOccaRegisterFunction(ctx, "HasBorrowedDataOfType", QFunctionContext::ceedHasBorrowedDataOfType);
-  CeedOccaRegisterFunction(ctx, "SetData", QFunctionContext::ceedSetData);
-  CeedOccaRegisterFunction(ctx, "TakeData", QFunctionContext::ceedTakeData);
-  CeedOccaRegisterFunction(ctx, "GetData", QFunctionContext::ceedGetData);
-  CeedOccaRegisterFunction(ctx, "GetDataRead", QFunctionContext::ceedGetDataRead);
-  CeedOccaRegisterFunction(ctx, "RestoreData", QFunctionContext::ceedRestoreData);
-  CeedOccaRegisterFunction(ctx, "Destroy", QFunctionContext::ceedDestroy);
-
-  QFunctionContext *ctx_ = new QFunctionContext();
-  CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, ctx_));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunctionContext::ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->hasValidData(has_valid_data);
-}
-
-int QFunctionContext::ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->hasBorrowedDataOfType(mem_type, has_borrowed_data_of_type);
-}
-
-int QFunctionContext::ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->setData(mtype, cmode, data);
-}
-
-int QFunctionContext::ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->takeData(mtype, data);
-}
-
-int QFunctionContext::ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->getData(mtype, data);
-}
-
-int QFunctionContext::ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  // Todo: Determine if calling getData is sufficient
-  return ctx_->getData(mtype, data);
-}
-
-int QFunctionContext::ceedRestoreData(CeedQFunctionContext ctx) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->restoreData();
-}
-
-int QFunctionContext::ceedDestroy(CeedQFunctionContext ctx) {
-  delete getQFunctionContext(ctx, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunctioncontext.hpp b/backends/occa/ceed-occa-qfunctioncontext.hpp
deleted file mode 100644
index b00857c2fb..0000000000
--- a/backends/occa/ceed-occa-qfunctioncontext.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTIONCONTEXT_HEADER
-#define CEED_OCCA_QFUNCTIONCONTEXT_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-
-namespace ceed {
-namespace occa {
-class QFunctionContext : public CeedObject {
- public:
-  // Owned resources
-  size_t         ctxSize;
-  ::occa::memory memory;
-  void          *hostBuffer;
-
-  // Current resources
-  ::occa::memory currentMemory;
-  void          *currentHostBuffer;
-
-  // State information
-  int syncState;
-
-  QFunctionContext();
-
-  ~QFunctionContext();
-
-  static QFunctionContext *getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid = true);
-
-  static QFunctionContext *from(CeedQFunctionContext ctx);
-
-  ::occa::memory dataToMemory(const void *data) {
-    ::occa::memory mem((::occa::modeMemory_t *)data);
-    return mem;
-  }
-
-  void *memoryToData(::occa::memory &memory) { return memory.getModeMemory(); }
-
-  void resizeCtx(const size_t ctxSize_);
-
-  void resizeCtxMemory(const size_t ctxSize_);
-
-  void resizeCtxMemory(::occa::device device, const size_t ctxSize_);
-
-  void resizeHostCtxBuffer(const size_t ctxSize_);
-
-  void setCurrentCtxMemoryIfNeeded();
-
-  void setCurrentHostCtxBufferIfNeeded();
-
-  void freeHostCtxBuffer();
-
-  int hasValidData(bool *has_valid_data) const;
-
-  int hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const;
-
-  int setData(CeedMemType mtype, CeedCopyMode cmode, void *data);
-
-  int copyDataValues(CeedMemType mtype, void *data);
-
-  int ownDataPointer(CeedMemType mtype, void *data);
-
-  int useDataPointer(CeedMemType mtype, void *data);
-
-  int takeData(CeedMemType mtype, void *data);
-
-  int getData(CeedMemType mtype, void *data);
-
-  int restoreData();
-
-  ::occa::memory getKernelArg();
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedQFunctionContext ctx);
-
-  static int ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data);
-
-  static int ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type);
-
-  static int ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data);
-
-  static int ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data);
-
-  static int ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data);
-
-  static int ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data);
-
-  static int ceedRestoreData(CeedQFunctionContext ctx);
-
-  static int ceedDestroy(CeedQFunctionContext ctx);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-simplex-basis.cpp b/backends/occa/ceed-occa-simplex-basis.cpp
deleted file mode 100644
index 000e68df0a..0000000000
--- a/backends/occa/ceed-occa-simplex-basis.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-simplex-basis.hpp"
-
-#include "ceed-occa-kernels.hpp"
-
-namespace ceed {
-namespace occa {
-SimplexBasis::SimplexBasis(CeedBasis basis, CeedInt dim_, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_,
-                           const CeedScalar *qWeight_) {
-  setCeedFields(basis);
-
-  dim = dim_;
-  P   = P_;
-  Q   = Q_;
-
-  ::occa::device device = getDevice();
-
-  interp  = device.malloc<CeedScalar>(P * Q, interp_);
-  grad    = device.malloc<CeedScalar>(P * Q * dim, grad_);
-  qWeight = device.malloc<CeedScalar>(Q, qWeight_);
-
-  setKernelProperties();
-}
-
-SimplexBasis::~SimplexBasis() {}
-
-bool SimplexBasis::isTensorBasis() const { return false; }
-
-const char *SimplexBasis::getFunctionSource() const {
-  // TODO: Add gpu function sources when split
-  return occa_simplex_basis_cpu_function_source;
-}
-
-void SimplexBasis::setKernelProperties() {
-  kernelProperties["defines/CeedInt"]               = ::occa::dtype::get<CeedInt>().name();
-  kernelProperties["defines/CeedScalar"]            = ::occa::dtype::get<CeedScalar>().name();
-  kernelProperties["defines/DIM"]                   = dim;
-  kernelProperties["defines/Q"]                     = Q;
-  kernelProperties["defines/P"]                     = P;
-  kernelProperties["defines/MAX_PQ"]                = P > Q ? P : Q;
-  kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount;
-  if (usingGpuDevice()) {
-    kernelProperties["defines/ELEMENTS_PER_BLOCK"] = (Q <= 1024) ? (1024 / Q) : 1;
-  }
-}
-
-::occa::kernel SimplexBasis::buildKernel(const std::string &kernelName) {
-  std::string kernelSource;
-  if (usingGpuDevice()) {
-    kernelSource = occa_simplex_basis_gpu_source;
-  } else {
-    kernelSource = occa_simplex_basis_cpu_function_source;
-    kernelSource += '\n';
-    kernelSource += occa_simplex_basis_cpu_kernel_source;
-  }
-
-  return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties);
-}
-
-int SimplexBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!interpTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      interpTKernel                         = buildKernel("interp");
-    }
-
-    interpTKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!interpKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      interpKernel                          = buildKernel("interp");
-    }
-
-    interpKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int SimplexBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!gradTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      gradTKernel                           = buildKernel("grad");
-    }
-
-    gradTKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!gradKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      gradKernel                            = buildKernel("grad");
-    }
-
-    gradKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int SimplexBasis::applyWeight(const CeedInt elementCount, Vector &W) {
-  if (!weightKernel.isInitialized()) {
-    weightKernel = buildKernel("weight");
-  }
-  weightKernel(elementCount, qWeight, W.getKernelArg());
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int SimplexBasis::apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V) {
-  const bool transpose = tmode == CEED_TRANSPOSE;
-
-  if ((dim < 1) || (3 < dim)) {
-    return ceedError("Backend only supports dimensions: 1, 2, and 3");
-  }
-
-  // Check arguments
-  if (emode != CEED_EVAL_WEIGHT) {
-    if (!U) {
-      return ceedError("Incorrect CeedVector input: U");
-    }
-  }
-  if (!V) {
-    return ceedError("Incorrect CeedVector input: V");
-  }
-
-  try {
-    // Apply kernel
-    switch (emode) {
-      case CEED_EVAL_INTERP:
-        return applyInterp(elementCount, transpose, *U, *V);
-      case CEED_EVAL_GRAD:
-        return applyGrad(elementCount, transpose, *U, *V);
-      case CEED_EVAL_WEIGHT:
-        return applyWeight(elementCount, *V);
-      default:
-        return ceedError("Backend does not support given simplex eval mode");
-    }
-  } catch (::occa::exception &exc) {
-    // Handle kernel build errors the CEED way
-    CeedHandleOccaException(exc);
-  }
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-------------
-int SimplexBasis::ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad,
-                             const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-
-  SimplexBasis *basis_ = new SimplexBasis(basis, dim, ndof, nquad, interp, grad, qWeight);
-  CeedCallBackend(CeedBasisSetData(basis, basis_));
-
-  CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply);
-  CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-simplex-basis.hpp b/backends/occa/ceed-occa-simplex-basis.hpp
deleted file mode 100644
index a5de4701a9..0000000000
--- a/backends/occa/ceed-occa-simplex-basis.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_SIMPLEXBASIS_HEADER
-#define CEED_OCCA_SIMPLEXBASIS_HEADER
-
-#include "ceed-occa-basis.hpp"
-
-namespace ceed {
-namespace occa {
-class SimplexBasis : public Basis {
- public:
-  ::occa::memory interp;
-  ::occa::memory grad;
-  ::occa::memory qWeight;
-
-  ::occa::json   kernelProperties;
-  ::occa::kernel interpKernel;
-  ::occa::kernel interpTKernel;
-  ::occa::kernel gradKernel;
-  ::occa::kernel gradTKernel;
-  ::occa::kernel weightKernel;
-
-  SimplexBasis(CeedBasis basis, CeedInt dim, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_, const CeedScalar *qWeight_);
-
-  ~SimplexBasis();
-
-  bool isTensorBasis() const;
-
-  const char *getFunctionSource() const;
-
-  void setKernelProperties();
-
-  std::string getKernelSource() const;
-
-  ::occa::kernel buildKernel(const std::string &kernelName);
-
-  int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyWeight(const CeedInt elementCount, Vector &W);
-
-  int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *u, Vector *v);
-
-  //---[ Ceed Callbacks ]-----------
-  static int ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad,
-                        const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-tensor-basis.cpp b/backends/occa/ceed-occa-tensor-basis.cpp
deleted file mode 100644
index 9cca7e8318..0000000000
--- a/backends/occa/ceed-occa-tensor-basis.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-tensor-basis.hpp"
-
-#include "ceed-occa-kernels.hpp"
-
-namespace ceed {
-namespace occa {
-TensorBasis::TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar *interp1D_, const CeedScalar *grad1D_,
-                         const CeedScalar *qWeight1D_)
-    : P1D(P1D_), Q1D(Q1D_) {
-  setCeedFields(basis);
-
-  dim = dim_;
-
-  P = P1D;
-  Q = Q1D;
-  for (int i = 1; i < dim; ++i) {
-    P *= P1D;
-    Q *= Q1D;
-  }
-
-  ::occa::device device = getDevice();
-
-  interp1D  = device.malloc<CeedScalar>(P1D * Q1D, interp1D_);
-  grad1D    = device.malloc<CeedScalar>(P1D * Q1D, grad1D_);
-  qWeight1D = device.malloc<CeedScalar>(Q1D, qWeight1D_);
-
-  setKernelProperties();
-}
-
-TensorBasis::~TensorBasis() {}
-
-bool TensorBasis::isTensorBasis() const { return true; }
-
-void TensorBasis::setKernelProperties() {
-  kernelProperties["defines/CeedInt"]               = ::occa::dtype::get<CeedInt>().name();
-  kernelProperties["defines/CeedScalar"]            = ::occa::dtype::get<CeedScalar>().name();
-  kernelProperties["defines/Q1D"]                   = Q1D;
-  kernelProperties["defines/P1D"]                   = P1D;
-  kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount;
-  if (usingGpuDevice()) {
-    kernelProperties["defines/MAX_PQ"] = (Q1D > P1D) ? Q1D : P1D;
-  }
-}
-
-const char *TensorBasis::getFunctionSource() const {
-  // TODO: Add gpu function sources when split
-  const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source,
-                                       occa_tensor_basis_3d_cpu_function_source};
-  return cpuFunctionSources[dim - 1];
-}
-
-std::string TensorBasis::getKernelSource() const {
-  const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source,
-                                       occa_tensor_basis_3d_cpu_function_source};
-  const char *cpuKernelSources[3]   = {occa_tensor_basis_1d_cpu_kernel_source, occa_tensor_basis_2d_cpu_kernel_source,
-                                       occa_tensor_basis_3d_cpu_kernel_source};
-  const char *gpuKernelSources[3]   = {occa_tensor_basis_1d_gpu_source, occa_tensor_basis_2d_gpu_source, occa_tensor_basis_3d_gpu_source};
-
-  std::string kernelSource;
-  if (usingGpuDevice()) {
-    kernelSource = gpuKernelSources[dim - 1];
-  } else {
-    kernelSource = cpuFunctionSources[dim - 1];
-    kernelSource += '\n';
-    kernelSource += cpuKernelSources[dim - 1];
-  }
-  return kernelSource;
-}
-
-::occa::kernel TensorBasis::buildKernel(const std::string &kernelName) {
-  std::string kernelSource = getKernelSource();
-  return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties);
-}
-
-int TensorBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!interpTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp();
-      interpTKernel                                  = buildKernel("interp");
-    }
-    interpTKernel(elementCount, interp1D, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!interpKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp();
-      interpKernel                                   = buildKernel("interp");
-    }
-    interpKernel(elementCount, interp1D, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int TensorBasis::elementsPerBlockInterp() const {
-  int elementsPerBlock;
-  if (dim == 1) {
-    elementsPerBlock = 32;
-  } else if (dim == 2) {
-    const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8};
-    if (Q1D < 7) {
-      elementsPerBlock = blocksByQ[Q1D];
-    } else {
-      elementsPerBlock = 1;
-    }
-  } else {
-    elementsPerBlock = 1;
-  }
-  return elementsPerBlock;
-}
-
-int TensorBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!gradTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad();
-      gradTKernel                                    = buildKernel("grad");
-    }
-    gradTKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!gradKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad();
-      gradKernel                                     = buildKernel("grad");
-    }
-    gradKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int TensorBasis::elementsPerBlockGrad() const {
-  int elementsPerBlock;
-  if (dim == 1) {
-    elementsPerBlock = 32;
-  } else if (dim == 2) {
-    const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8};
-    if (Q1D < 7) {
-      elementsPerBlock = blocksByQ[Q1D];
-    } else {
-      elementsPerBlock = 1;
-    }
-  } else {
-    elementsPerBlock = 1;
-  }
-  return elementsPerBlock;
-}
-
-int TensorBasis::applyWeight(const CeedInt elementCount, Vector &W) {
-  if (!weightKernel.isInitialized()) {
-    kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockWeight();
-    weightKernel                                   = buildKernel("weight");
-  }
-  weightKernel(elementCount, qWeight1D, W.getKernelArg());
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int TensorBasis::elementsPerBlockWeight() const {
-  int elementsPerBlock;
-  if (dim == 1) {
-    elementsPerBlock = 32 / Q1D;
-  } else if (dim == 2) {
-    if ((Q1D * Q1D) > 32) {
-      elementsPerBlock = 1;
-    } else {
-      elementsPerBlock = 32 / (Q1D * Q1D);
-    }
-  } else {
-    elementsPerBlock = Q1D;
-  }
-  return elementsPerBlock;
-}
-
-int TensorBasis::apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V) {
-  const bool transpose = tmode == CEED_TRANSPOSE;
-
-  if ((dim < 1) || (3 < dim)) {
-    return ceedError("Backend only supports dimensions: 1, 2, and 3");
-  }
-
-  // Check arguments
-  if (emode != CEED_EVAL_WEIGHT) {
-    if (!U) {
-      return ceedError("Incorrect CeedVector input: U");
-    }
-  }
-  if (!V) {
-    return ceedError("Incorrect CeedVector input: V");
-  }
-
-  try {
-    // Apply kernel
-    switch (emode) {
-      case CEED_EVAL_INTERP:
-        return applyInterp(elementCount, transpose, *U, *V);
-      case CEED_EVAL_GRAD:
-        return applyGrad(elementCount, transpose, *U, *V);
-      case CEED_EVAL_WEIGHT:
-        return applyWeight(elementCount, *V);
-      default:
-        return ceedError("Backend does not support given tensor eval mode");
-    }
-  } catch (::occa::exception &exc) {
-    // Handle kernel build errors the CEED way
-    CeedHandleOccaException(exc);
-  }
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-------------
-int TensorBasis::ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D,
-                            const CeedScalar *qWeight1D, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-
-  if (Q1D < P1D && Context::from(ceed)->usingGpuDevice()) {
-    return staticCeedError("(OCCA) Backend does not implement underintegrated basis");
-  }
-
-  TensorBasis *basis_ = new TensorBasis(basis, dim, P1D, Q1D, interp1D, grad1D, qWeight1D);
-  CeedCallBackend(CeedBasisSetData(basis, basis_));
-
-  CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply);
-  CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-tensor-basis.hpp b/backends/occa/ceed-occa-tensor-basis.hpp
deleted file mode 100644
index 88a6eb3029..0000000000
--- a/backends/occa/ceed-occa-tensor-basis.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_TENSORBASIS_HEADER
-#define CEED_OCCA_TENSORBASIS_HEADER
-
-#include "ceed-occa-basis.hpp"
-
-namespace ceed {
-namespace occa {
-class TensorBasis : public Basis {
- public:
-  CeedInt        P1D;
-  CeedInt        Q1D;
-  ::occa::memory interp1D;
-  ::occa::memory grad1D;
-  ::occa::memory qWeight1D;
-
-  ::occa::json   kernelProperties;
-  ::occa::kernel interpKernel;
-  ::occa::kernel interpTKernel;
-  ::occa::kernel gradKernel;
-  ::occa::kernel gradTKernel;
-  ::occa::kernel weightKernel;
-
-  TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar *interp1D_, const CeedScalar *grad1D_,
-              const CeedScalar *qWeight1D_);
-
-  ~TensorBasis();
-
-  bool isTensorBasis() const;
-
-  const char *getFunctionSource() const;
-
-  std::string getKernelSource() const;
-
-  void setKernelProperties();
-
-  int elementsPerBlockInterp() const;
-  int elementsPerBlockGrad() const;
-  int elementsPerBlockWeight() const;
-
-  ::occa::kernel buildKernel(const std::string &kernelName);
-
-  int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyWeight(const CeedInt elementCount, Vector &W);
-
-  int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V);
-
-  //---[ Ceed Callbacks ]-----------
-  static int ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D,
-                        const CeedScalar *qWeight1D, CeedBasis basis);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-types.hpp b/backends/occa/ceed-occa-types.hpp
deleted file mode 100644
index 52496e934c..0000000000
--- a/backends/occa/ceed-occa-types.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_TYPES_HEADER
-#define CEED_OCCA_TYPES_HEADER
-
-#include <ceed/backend.h>
-
-#include <occa.hpp>
-
-#define CeedOccaFromChk(ierr) \
-  do {                        \
-    if (ierr) {               \
-      return NULL;            \
-    }                         \
-  } while (0)
-
-#define CeedCallOcca(...)      \
-  do {                         \
-    int ierr_q_ = __VA_ARGS__; \
-    CeedOccaFromChk(ierr_q_);  \
-  } while (0);
-
-#define CeedOccaValidChk(isValidVar, ierr) \
-  do {                                     \
-    if (ierr) {                            \
-      isValidVar = false;                  \
-      return;                              \
-    }                                      \
-  } while (0)
-
-#define CeedCallOccaValid(isValidVar, ...) \
-  do {                                     \
-    int ierr_q_ = __VA_ARGS__;             \
-    CeedOccaValidChk(isValidVar, ierr_q_); \
-  } while (0);
-
-#define CeedHandleOccaException(exc)                           \
-  do {                                                         \
-    std::string error = exc.toString();                        \
-    return CeedError(ceed, CEED_ERROR_BACKEND, error.c_str()); \
-  } while (0)
-
-#define CeedOccaCastRegisterFunction(func) (ceed::occa::ceedFunction)(void *) func
-
-#define CeedOccaRegisterBaseFunction(name, func) CeedCallBackend(registerCeedFunction(ceed, name, CeedOccaCastRegisterFunction(func)));
-
-#define CeedOccaRegisterFunction(object, name, func) CeedCallBackend(registerCeedFunction(ceed, object, name, CeedOccaCastRegisterFunction(func)));
-
-namespace ceed {
-namespace occa {
-typedef int (*ceedFunction)();
-}
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-vector.cpp b/backends/occa/ceed-occa-vector.cpp
deleted file mode 100644
index efcabd15f9..0000000000
--- a/backends/occa/ceed-occa-vector.cpp
+++ /dev/null
@@ -1,460 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-vector.hpp"
-
-#include <cstring>
-
-#include "ceed-occa-kernels.hpp"
-
-namespace ceed {
-namespace occa {
-Vector::Vector() : length(0), hostBufferLength(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {}
-
-Vector::~Vector() {
-  memory.free();
-  freeHostBuffer();
-}
-
-int Vector::hasValidArray(bool *has_valid_array) {
-  (*has_valid_array) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized());
-  return CEED_ERROR_SUCCESS;
-}
-
-int Vector::hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type) {
-  switch (mem_type) {
-    case CEED_MEM_HOST:
-      (*has_borrowed_array_of_type) = !!currentHostBuffer;
-      break;
-    case CEED_MEM_DEVICE:
-      (*has_borrowed_array_of_type) = currentMemory.isInitialized();
-      break;
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-Vector *Vector::getVector(CeedVector vec, const bool assertValid) {
-  if (!vec || vec == CEED_VECTOR_NONE) {
-    return NULL;
-  }
-
-  int     ierr;
-  Vector *vector = NULL;
-
-  ierr = CeedVectorGetData(vec, &vector);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return vector;
-}
-
-Vector *Vector::from(CeedVector vec) {
-  Vector *vector = getVector(vec);
-  if (!vector) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedVectorGetCeed(vec, &vector->ceed));
-  CeedCallOcca(CeedVectorGetLength(vec, &vector->length));
-
-  return vector;
-}
-
-void Vector::resize(const CeedSize length_) { length = length_; }
-
-void Vector::resizeMemory(const CeedSize length_) { resizeMemory(getDevice(), length_); }
-
-void Vector::resizeMemory(::occa::device device, const CeedSize length_) {
-  if (length_ != (CeedSize)memory.length()) {
-    memory.free();
-    memory = device.malloc<CeedScalar>(length_);
-  }
-}
-
-void Vector::resizeHostBuffer(const CeedSize length_) {
-  if (length_ != hostBufferLength) {
-    delete hostBuffer;
-    hostBuffer = new CeedScalar[length_];
-  }
-}
-
-void Vector::setCurrentMemoryIfNeeded() {
-  if (!currentMemory.isInitialized()) {
-    resizeMemory(length);
-    currentMemory = memory;
-  }
-}
-
-void Vector::setCurrentHostBufferIfNeeded() {
-  if (!currentHostBuffer) {
-    resizeHostBuffer(length);
-    currentHostBuffer = hostBuffer;
-  }
-}
-
-void Vector::freeHostBuffer() {
-  if (hostBuffer) {
-    delete[] hostBuffer;
-    hostBuffer = NULL;
-  }
-}
-
-int Vector::setValue(CeedScalar value) {
-  // Prioritize keeping data in the device
-  if (syncState & SyncState::device) {
-    setCurrentMemoryIfNeeded();
-    if (!setValueKernel.isInitialized()) {
-      ::occa::json kernelProperties;
-      CeedInt constexpr block_size{256};
-      kernelProperties["defines/CeedInt"]    = ::occa::dtype::get<CeedInt>().name();
-      kernelProperties["defines/CeedScalar"] = ::occa::dtype::get<CeedScalar>().name();
-      kernelProperties["defines/BLOCK_SIZE"] = block_size;
-
-      std::string kernelSource = occa_set_value_source;
-      setValueKernel           = getDevice().buildKernelFromString(kernelSource, "setValue", kernelProperties);
-      setValueKernel(currentMemory, value, length);
-    }
-    syncState = SyncState::device;
-  } else {
-    setCurrentHostBufferIfNeeded();
-    for (CeedInt i = 0; i < length; ++i) {
-      currentHostBuffer[i] = value;
-    }
-    syncState = SyncState::host;
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int Vector::setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) {
-  switch (cmode) {
-    case CEED_COPY_VALUES:
-      return copyArrayValues(mtype, array);
-    case CEED_OWN_POINTER:
-      return ownArrayPointer(mtype, array);
-    case CEED_USE_POINTER:
-      return useArrayPointer(mtype, array);
-  }
-  return ceedError("Invalid CeedCopyMode passed");
-}
-
-int Vector::takeArray(CeedMemType mtype, CeedScalar **array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      *array            = currentHostBuffer;
-      hostBuffer        = NULL;
-      currentHostBuffer = NULL;
-
-      syncState = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      *array        = memoryToArray<CeedScalar>(currentMemory);
-      memory        = ::occa::null;
-      currentMemory = ::occa::null;
-
-      syncState = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::copyArrayValues(CeedMemType mtype, CeedScalar *array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostBufferIfNeeded();
-      if (array) {
-        std::memcpy(currentHostBuffer, array, length * sizeof(CeedScalar));
-      }
-      syncState = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentMemoryIfNeeded();
-      if (array) {
-        currentMemory.copyFrom(arrayToMemory(array));
-      }
-      syncState = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::ownArrayPointer(CeedMemType mtype, CeedScalar *array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostBuffer();
-      hostBuffer = currentHostBuffer = array;
-      syncState                      = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      memory = currentMemory = arrayToMemory(array);
-      syncState              = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::useArrayPointer(CeedMemType mtype, CeedScalar *array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostBuffer();
-      currentHostBuffer = array;
-      syncState         = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      currentMemory = arrayToMemory(array);
-      syncState     = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::getArray(CeedMemType mtype, CeedScalar **array) {
-  // The passed `array` might be modified before restoring
-  // so we can't set sync state to SyncState::all
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      syncState = SyncState::host;
-      *array    = currentHostBuffer;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      syncState = SyncState::device;
-      *array    = memoryToArray<CeedScalar>(currentMemory);
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::getReadOnlyArray(CeedMemType mtype, CeedScalar **array) {
-  const bool willBeFullySynced =
-      ((syncState == SyncState::device && mtype == CEED_MEM_HOST) || (syncState == SyncState::host && mtype == CEED_MEM_DEVICE));
-
-  const int error = getArray(mtype, const_cast<CeedScalar **>(array));
-  // Take advantage the vector will be fully synced
-  if (!error && willBeFullySynced) {
-    syncState = SyncState::all;
-  }
-
-  return error;
-}
-
-int Vector::getWriteOnlyArray(CeedMemType mtype, CeedScalar **array) {
-  // const bool willBeFullySynced = (
-  //   (syncState == SyncState::device && mtype == CEED_MEM_HOST) ||
-  //   (syncState == SyncState::host && mtype == CEED_MEM_DEVICE)
-  // );
-
-  const int error = getArray(mtype, const_cast<CeedScalar **>(array));
-  // // Take advantage the vector will be fully synced
-  // if (!error && willBeFullySynced) {
-  //   syncState = SyncState::all;
-  // }
-
-  return error;
-}
-
-int Vector::restoreArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; }
-
-int Vector::restoreReadOnlyArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; }
-
-::occa::memory Vector::getKernelArg() {
-  setCurrentMemoryIfNeeded();
-  if (syncState == SyncState::host) {
-    setCurrentHostBufferIfNeeded();
-    currentMemory.copyFrom(currentHostBuffer);
-  }
-  syncState = SyncState::device;
-  return currentMemory;
-}
-
-::occa::memory Vector::getConstKernelArg() {
-  setCurrentMemoryIfNeeded();
-  if (syncState == SyncState::host) {
-    setCurrentHostBufferIfNeeded();
-    currentMemory.copyFrom(currentHostBuffer);
-    syncState = SyncState::all;
-  }
-  return currentMemory;
-}
-
-void Vector::printValues(const std::string &name) {
-  CeedScalar *values;
-  getReadOnlyArray(CEED_MEM_HOST, &values);
-
-  std::cout << std::setprecision(8) << "Vector: " << name << std::endl << "  - Values: " << std::endl;
-
-  for (int i = 0; i < length; ++i) {
-    printf("    %12.8f\n", values[i]);
-  }
-}
-
-void Vector::printNonZeroValues(const std::string &name) {
-  CeedScalar *values;
-  getReadOnlyArray(CEED_MEM_HOST, &values);
-
-  std::cout << std::setprecision(8) << "Vector: " << name << std::endl << "  - Non-zero values: " << std::endl;
-
-  for (int i = 0; i < length; ++i) {
-    if (fabs(values[i]) > 1e-8) {
-      printf("    %d: %12.8f\n", i, values[i]);
-    }
-  }
-}
-
-void Vector::printSummary(const std::string &name) {
-  CeedScalar *values;
-  getReadOnlyArray(CEED_MEM_HOST, &values);
-
-  CeedScalar minValue = values[0];
-  CeedScalar maxValue = values[0];
-
-  for (int i = 0; i < length; ++i) {
-    const CeedScalar value = values[i];
-    minValue               = minValue < value ? minValue : value;
-    maxValue               = maxValue > value ? maxValue : value;
-  }
-
-  std::cout << std::setprecision(8) << "Vector: " << name << std::endl
-            << "  - Length: " << length << std::endl
-            << "  - Min   : " << minValue << std::endl
-            << "  - Max   : " << maxValue << std::endl;
-}
-
-//---[ Ceed Callbacks ]-----------
-int Vector::registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Vector", vec, fname, f);
-}
-
-int Vector::ceedCreate(CeedSize length, CeedVector vec) {
-  Ceed ceed;
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-
-  CeedOccaRegisterFunction(vec, "HasValidArray", Vector::ceedHasValidArray);
-  CeedOccaRegisterFunction(vec, "HasBorrowedArrayOfType", Vector::ceedHasBorrowedArrayOfType);
-  CeedOccaRegisterFunction(vec, "SetValue", Vector::ceedSetValue);
-  CeedOccaRegisterFunction(vec, "SetArray", Vector::ceedSetArray);
-  CeedOccaRegisterFunction(vec, "TakeArray", Vector::ceedTakeArray);
-  CeedOccaRegisterFunction(vec, "GetArray", Vector::ceedGetArray);
-  CeedOccaRegisterFunction(vec, "GetArrayRead", Vector::ceedGetArrayRead);
-  CeedOccaRegisterFunction(vec, "GetArrayWrite", Vector::ceedGetArrayWrite);
-  CeedOccaRegisterFunction(vec, "RestoreArray", Vector::ceedRestoreArray);
-  CeedOccaRegisterFunction(vec, "RestoreArrayRead", Vector::ceedRestoreArrayRead);
-  CeedOccaRegisterFunction(vec, "Destroy", Vector::ceedDestroy);
-
-  Vector *vector = new Vector();
-  CeedCallBackend(CeedVectorSetData(vec, vector));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int Vector::ceedHasValidArray(CeedVector vec, bool *has_valid_array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->hasValidArray(has_valid_array);
-}
-
-int Vector::ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->hasBorrowedArrayOfType(mem_type, has_borrowed_array_of_type);
-}
-
-int Vector::ceedSetValue(CeedVector vec, CeedScalar value) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->setValue(value);
-}
-
-int Vector::ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->setArray(mtype, cmode, array);
-}
-
-int Vector::ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->takeArray(mtype, array);
-}
-
-int Vector::ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->getArray(mtype, array);
-}
-
-int Vector::ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->getReadOnlyArray(mtype, array);
-}
-
-int Vector::ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->getWriteOnlyArray(mtype, array);
-}
-
-int Vector::ceedRestoreArray(CeedVector vec, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->restoreArray(array);
-}
-
-int Vector::ceedRestoreArrayRead(CeedVector vec, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->restoreReadOnlyArray(array);
-}
-
-int Vector::ceedDestroy(CeedVector vec) {
-  delete getVector(vec, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-vector.hpp b/backends/occa/ceed-occa-vector.hpp
deleted file mode 100644
index 7b2f8d730e..0000000000
--- a/backends/occa/ceed-occa-vector.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_VECTOR_HEADER
-#define CEED_OCCA_VECTOR_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-
-namespace ceed {
-namespace occa {
-template <class TM>
-::occa::memory arrayToMemory(const TM *array) {
-  if (array) {
-    ::occa::memory mem((::occa::modeMemory_t *)array);
-    mem.setDtype(::occa::dtype::get<TM>());
-    return mem;
-  }
-  return ::occa::null;
-}
-
-template <class TM>
-TM *memoryToArray(::occa::memory &memory) {
-  return (TM *)memory.getModeMemory();
-}
-
-class Vector : public CeedObject {
- public:
-  // Owned resources
-  CeedSize       length;
-  ::occa::memory memory;
-  CeedSize       hostBufferLength;
-  CeedScalar    *hostBuffer;
-
-  ::occa::kernel setValueKernel;
-
-  // Current resources
-  ::occa::memory currentMemory;
-  CeedScalar    *currentHostBuffer;
-
-  // State information
-  int syncState;
-
-  Vector();
-
-  ~Vector();
-
-  int hasValidArray(bool *has_valid_array);
-
-  int hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type);
-
-  static Vector *getVector(CeedVector vec, const bool assertValid = true);
-
-  static Vector *from(CeedVector vec);
-
-  void resize(const CeedSize length_);
-
-  void resizeMemory(const CeedSize length_);
-
-  void resizeMemory(::occa::device device, const CeedSize length_);
-
-  void resizeHostBuffer(const CeedSize length_);
-
-  void setCurrentMemoryIfNeeded();
-
-  void setCurrentHostBufferIfNeeded();
-
-  void freeHostBuffer();
-
-  int setValue(CeedScalar value);
-
-  int setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array);
-
-  int takeArray(CeedMemType mtype, CeedScalar **array);
-
-  int copyArrayValues(CeedMemType mtype, CeedScalar *array);
-
-  int ownArrayPointer(CeedMemType mtype, CeedScalar *array);
-
-  int useArrayPointer(CeedMemType mtype, CeedScalar *array);
-
-  int getArray(CeedMemType mtype, CeedScalar **array);
-
-  int getReadOnlyArray(CeedMemType mtype, CeedScalar **array);
-
-  int getWriteOnlyArray(CeedMemType mtype, CeedScalar **array);
-
-  int restoreArray(CeedScalar **array);
-
-  int restoreReadOnlyArray(CeedScalar **array);
-
-  ::occa::memory getKernelArg();
-
-  ::occa::memory getConstKernelArg();
-
-  void printValues(const std::string &name);
-  void printNonZeroValues(const std::string &name);
-  void printSummary(const std::string &name);
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedHasValidArray(CeedVector vec, bool *has_valid_array);
-
-  static int ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
-
-  static int ceedCreate(CeedSize length, CeedVector vec);
-
-  static int ceedSetValue(CeedVector vec, CeedScalar value);
-
-  static int ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array);
-
-  static int ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedRestoreArray(CeedVector vec, CeedScalar **array);
-
-  static int ceedRestoreArrayRead(CeedVector vec, CeedScalar **array);
-
-  static int ceedDestroy(CeedVector vec);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa.cpp b/backends/occa/ceed-occa.cpp
deleted file mode 100644
index 4cdbe5a290..0000000000
--- a/backends/occa/ceed-occa.cpp
+++ /dev/null
@@ -1,329 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#warning "libCEED OCCA backend is experimental; for best performance, use device native backends"
-
-#include <map>
-#include <occa.hpp>
-#include <vector>
-
-#include "ceed-occa-context.hpp"
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-operator.hpp"
-#include "ceed-occa-qfunction.hpp"
-#include "ceed-occa-qfunctioncontext.hpp"
-#include "ceed-occa-simplex-basis.hpp"
-#include "ceed-occa-tensor-basis.hpp"
-#include "ceed-occa-types.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::map<std::string, std::string> StringMap;
-typedef std::vector<std::string>           StringVector;
-
-enum ResourceParserStep { RESOURCE, QUERY_KEY, QUERY_VALUE };
-
-static const char RESOURCE_DELIMITER        = '/';
-static const char QUERY_DELIMITER           = ':';
-static const char QUERY_KEY_VALUE_DELIMITER = '=';
-static const char QUERY_ARG_DELIMITER       = ',';
-
-static std::string getDefaultDeviceMode(const bool cpuMode, const bool gpuMode) {
-  // In case both cpuMode and gpuMode are set, prioritize the GPU if available
-  // For example, if the resource is "/*/occa"
-  if (gpuMode) {
-    if (::occa::modeIsEnabled("CUDA")) {
-      return "CUDA";
-    }
-    if (::occa::modeIsEnabled("HIP")) {
-      return "HIP";
-    }
-    if (::occa::modeIsEnabled("dpcpp")) {
-      return "dpcpp";
-    }
-    if (::occa::modeIsEnabled("OpenCL")) {
-      return "OpenCL";
-    }
-    // Metal doesn't support doubles
-  }
-
-  if (cpuMode) {
-    if (::occa::modeIsEnabled("OpenMP")) {
-      return "OpenMP";
-    }
-    return "Serial";
-  }
-
-  return "";
-}
-
-static int getDeviceMode(const std::string &match, std::string &mode) {
-  if (match == "cuda") {
-    mode = "CUDA";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "hip") {
-    mode = "HIP";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "dpcpp") {
-    mode = "dpcpp";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "opencl") {
-    mode = "OpenCL";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "openmp") {
-    mode = "OpenMP";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "serial") {
-    mode = "Serial";
-    return CEED_ERROR_SUCCESS;
-  }
-
-  const bool autoMode = match == "*";
-  const bool cpuMode  = match == "cpu";
-  const bool gpuMode  = match == "gpu";
-
-  mode = getDefaultDeviceMode(cpuMode || autoMode, gpuMode || autoMode);
-  return !mode.size();
-}
-
-static int splitCeedResource(const std::string &resource, std::string &match, StringMap &query) {
-  /*
-   * resource:
-   *
-   *    "/gpu/occa?mode='CUDA':device_id=0"
-   *
-   * resourceVector:
-   *
-   *    ["gpu", "occa"]
-   *
-   * match:
-   *
-   *    "gpu"
-   *
-   * query:
-   *
-   *    {
-   *      "mode": "'CUDA'",
-   *      "device_id": "0",
-   *    }
-   */
-  const int   charCount  = (int)resource.size();
-  const char *c_resource = resource.c_str();
-
-  StringVector resourceVector;
-
-  ResourceParserStep parsingStep = RESOURCE;
-  int                wordStart   = 1;
-  std::string        queryKey;
-
-  // Check for /gpu/cuda/occa, /gpu/hip/occa, /cpu/self/occa, /cpu/openmp/occa
-  // Note: added for matching style with other backends
-  if (resource == "/gpu/cuda/occa") {
-    match = "cuda";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/gpu/hip/occa") {
-    match = "hip";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/gpu/dpcpp/occa") {
-    match = "dpcpp";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/gpu/opencl/occa") {
-    match = "opencl";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/cpu/openmp/occa") {
-    match = "openmp";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/cpu/self/occa") {
-    match = "serial";
-    return CEED_ERROR_SUCCESS;
-  }
-
-  // Skip initial slash
-  for (int i = 1; i <= charCount; ++i) {
-    const char c = c_resource[i];
-
-    if (parsingStep == RESOURCE) {
-      if (c == RESOURCE_DELIMITER || c == QUERY_DELIMITER || c == '\0') {
-        resourceVector.push_back(resource.substr(wordStart, i - wordStart));
-        wordStart = i + 1;
-
-        // Check if we are done parsing the resource
-        if (c == QUERY_DELIMITER) {
-          parsingStep = QUERY_KEY;
-        }
-      }
-    } else if (parsingStep == QUERY_KEY) {
-      if (c == QUERY_KEY_VALUE_DELIMITER) {
-        queryKey  = resource.substr(wordStart, i - wordStart);
-        wordStart = i + 1;
-
-        // Looking to parse the query value now
-        parsingStep = QUERY_VALUE;
-      }
-    } else if (parsingStep == QUERY_VALUE) {
-      if (c == QUERY_ARG_DELIMITER || c == '\0') {
-        query[queryKey] = resource.substr(wordStart, i - wordStart);
-        wordStart       = i + 1;
-
-        // Back to parsing the next query argument
-        parsingStep = QUERY_KEY;
-        queryKey    = "";
-      }
-    }
-  }
-
-  // Looking for [match, "occa"]
-  if (resourceVector.size() != 2 || resourceVector[1] != "occa") {
-    return 1;
-  }
-
-  match = resourceVector[0];
-  return CEED_ERROR_SUCCESS;
-}
-
-void setDefaultProps(::occa::properties &deviceProps, const std::string &defaultMode) {
-  std::string mode;
-  if (deviceProps.has("mode")) {
-    // Don't override mode if passed
-    mode = (std::string)deviceProps["mode"];
-  } else {
-    mode = defaultMode;
-    deviceProps.set("mode", mode);
-  }
-
-  // Set default device id
-  if ((mode == "CUDA") || (mode == "HIP") || (mode == "dpcpp") || (mode == "OpenCL")) {
-    if (!deviceProps.has("device_id")) {
-      deviceProps["device_id"] = 0;
-    }
-  }
-
-  // Set default platform id
-  if ((mode == "dpcpp") || (mode == "OpenCL")) {
-    if (!deviceProps.has("platform_id")) {
-      deviceProps["platform_id"] = 0;
-    }
-  }
-}
-
-static int initCeed(const char *c_resource, Ceed ceed) {
-  int         ierr;
-  std::string match;
-  StringMap   query;
-
-  ierr = splitCeedResource(c_resource, match, query);
-  if (ierr) {
-    return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource);
-  }
-
-  std::string mode;
-  ierr = getDeviceMode(match, mode);
-  if (ierr) {
-    return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource);
-  }
-
-  std::string               devicePropsStr = "{\n";
-  StringMap::const_iterator it;
-  for (it = query.begin(); it != query.end(); ++it) {
-    devicePropsStr += "  \"";
-    devicePropsStr += it->first;
-    devicePropsStr += "\": ";
-    devicePropsStr += it->second;
-    devicePropsStr += ",\n";
-  }
-  devicePropsStr += '}';
-
-  ::occa::properties deviceProps(devicePropsStr);
-  setDefaultProps(deviceProps, mode);
-
-  ceed::occa::Context *context = new Context(::occa::device(deviceProps));
-  CeedCallBackend(CeedSetData(ceed, context));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-static int destroyCeed(Ceed ceed) {
-  delete Context::from(ceed);
-  return CEED_ERROR_SUCCESS;
-}
-
-static int registerCeedFunction(Ceed ceed, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Ceed", ceed, fname, f);
-}
-
-static int preferHostMemType(CeedMemType *type) {
-  *type = CEED_MEM_HOST;
-  return CEED_ERROR_SUCCESS;
-}
-
-static int preferDeviceMemType(CeedMemType *type) {
-  *type = CEED_MEM_DEVICE;
-  return CEED_ERROR_SUCCESS;
-}
-
-static ceed::occa::ceedFunction getPreferredMemType(Ceed ceed) {
-  if (Context::from(ceed)->device.hasSeparateMemorySpace()) {
-    return (ceed::occa::ceedFunction)(void *)preferDeviceMemType;
-  }
-  return (ceed::occa::ceedFunction)(void *)preferHostMemType;
-}
-
-static int registerMethods(Ceed ceed) {
-  CeedOccaRegisterBaseFunction("Destroy", ceed::occa::destroyCeed);
-  CeedOccaRegisterBaseFunction("GetPreferredMemType", getPreferredMemType(ceed));
-  CeedOccaRegisterBaseFunction("VectorCreate", ceed::occa::Vector::ceedCreate);
-  CeedOccaRegisterBaseFunction("BasisCreateTensorH1", ceed::occa::TensorBasis::ceedCreate);
-  CeedOccaRegisterBaseFunction("BasisCreateH1", ceed::occa::SimplexBasis::ceedCreate);
-  CeedOccaRegisterBaseFunction("ElemRestrictionCreate", ceed::occa::ElemRestriction::ceedCreate);
-  CeedOccaRegisterBaseFunction("QFunctionCreate", ceed::occa::QFunction::ceedCreate);
-  CeedOccaRegisterBaseFunction("QFunctionContextCreate", ceed::occa::QFunctionContext::ceedCreate);
-  CeedOccaRegisterBaseFunction("OperatorCreate", ceed::occa::Operator::ceedCreate);
-  CeedOccaRegisterBaseFunction("CompositeOperatorCreate", ceed::occa::Operator::ceedCreateComposite);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-static int registerBackend(const char *resource, Ceed ceed) {
-  try {
-    CeedCallBackend(ceed::occa::initCeed(resource, ceed));
-  } catch (const ::occa::exception &e) {
-    CeedHandleOccaException(e);
-  }
-  try {
-    CeedCallBackend(ceed::occa::registerMethods(ceed));
-  } catch (const ::occa::exception &e) {
-    CeedHandleOccaException(e);
-  }
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
-
-CEED_INTERN int CeedRegister_Occa(void) {
-  // General mode
-  CeedCallBackend(CeedRegister("/*/occa", ceed::occa::registerBackend, 270));
-  // CPU Modes
-  CeedCallBackend(CeedRegister("/cpu/self/occa", ceed::occa::registerBackend, 260));
-  CeedCallBackend(CeedRegister("/cpu/openmp/occa", ceed::occa::registerBackend, 250));
-  // GPU Modes
-  CeedCallBackend(CeedRegister("/gpu/dpcpp/occa", ceed::occa::registerBackend, 240));
-  CeedCallBackend(CeedRegister("/gpu/opencl/occa", ceed::occa::registerBackend, 230));
-  CeedCallBackend(CeedRegister("/gpu/hip/occa", ceed::occa::registerBackend, 220));
-  CeedCallBackend(CeedRegister("/gpu/cuda/occa", ceed::occa::registerBackend, 210));
-  return CEED_ERROR_SUCCESS;
-}
diff --git a/backends/occa/ceed-occa.h b/backends/occa/ceed-occa.h
deleted file mode 100644
index 76283f4dc9..0000000000
--- a/backends/occa/ceed-occa.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <ceed.h>
-#include <ceed/backend.h>
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <sys/stat.h>
-
-// *****************************************************************************
-#define OCCA_PATH_MAX 4096
-
-// *****************************************************************************
-// used to get Dl_info struct declaration (vs _GNU_SOURCE?)
-#ifndef __USE_GNU
-#define __USE_GNU
-#endif
-#include <dlfcn.h>
-
-// *****************************************************************************
-#include "occa.h"
-
-// *****************************************************************************
-#define NO_OFFSET 0
-#define TILE_SIZE 32
-#define NO_PROPS occaDefault
-
-// *****************************************************************************
-// * CeedVector Occa struct
-// *****************************************************************************
-typedef struct {
-  CeedScalar *h_array;
-  CeedScalar *h_array_allocated;
-  occaMemory  d_array;
-} CeedVector_Occa;
-
-// *****************************************************************************
-// * CeedElemRestriction Occa struct
-// *****************************************************************************
-#define CEED_OCCA_NUM_RESTRICTION_KERNELS 8
-typedef struct {
-  bool       strided;
-  occaMemory d_indices;
-  occaMemory d_toffsets;
-  occaMemory d_tindices;
-  occaKernel kRestrict[CEED_OCCA_NUM_RESTRICTION_KERNELS];
-} CeedElemRestriction_Occa;
-
-// *****************************************************************************
-// * CeedBasis Occa struct
-// *****************************************************************************
-typedef struct {
-  bool                ready;
-  CeedElemRestriction er;
-  occaMemory          qref1d;
-  occaMemory          qweight1d;
-  occaMemory          interp1d;
-  occaMemory          grad1d;
-  occaMemory          tmp0, tmp1;
-  occaKernel          kZero, kInterp, kGrad, kWeight;
-} CeedBasis_Occa;
-
-// *****************************************************************************
-// * CeedOperator Occa struct
-// *****************************************************************************
-typedef struct {
-  CeedVector  *Evecs;  /// E-vectors needed to apply operator (in followed by out)
-  CeedScalar **Edata;
-  CeedVector  *evecsin;   /// Input E-vectors needed to apply operator
-  CeedVector  *evecsout;  /// Output E-vectors needed to apply operator
-  CeedVector  *qvecsin;   /// Input Q-vectors needed to apply operator
-  CeedVector  *qvecsout;  /// Output Q-vectors needed to apply operator
-  CeedInt      numein;
-  CeedInt      numeout;
-} CeedOperator_Occa;
-
-// *****************************************************************************
-// * CeedQFunction Occa struct
-// *****************************************************************************
-#define N_MAX_IDX 16
-typedef struct {
-  bool         ready;
-  CeedInt      idx, odx;
-  CeedInt      iOf7[N_MAX_IDX];
-  CeedInt      oOf7[N_MAX_IDX];
-  int          nc, dim, nelem, elemsize, e;
-  occaMemory   o_indata, o_outdata;
-  occaMemory   d_ctx, d_idx, d_odx;
-  char        *oklPath;
-  const char  *qFunctionName;
-  occaKernel   kQFunctionApply;
-  CeedOperator op;
-} CeedQFunction_Occa;
-
-// *****************************************************************************
-// * CeedQFunctionContext Occa struct
-// *****************************************************************************
-typedef struct {
-  CeedScalar *h_data;
-  CeedScalar *h_data_allocated;
-} CeedQFunctionContext_Occa;
-
-// *****************************************************************************
-// * Ceed Occa struct
-// *****************************************************************************
-typedef struct {
-  occaDevice device;
-  bool       ocl;
-  char      *libceed_dir;
-  char      *occa_cache_dir;
-} Ceed_Occa;
-
-// *****************************************************************************
-CEED_INTERN int CeedOklPath_Occa(const Ceed, const char *, const char *, char **);
-
-// *****************************************************************************
-CEED_INTERN int CeedOklDladdr_Occa(Ceed);
-
-// *****************************************************************************
-CEED_INTERN int CeedBasisCreateTensorH1_Occa(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
-                                             const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis);
-
-// *****************************************************************************
-CEED_INTERN int CeedBasisCreateH1_Occa(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp1d,
-                                       const CeedScalar *grad1d, const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis);
-
-// *****************************************************************************
-CEED_INTERN int CeedBasisApplyElems_Occa(CeedBasis basis, CeedInt Q, CeedTransposeMode tmode, CeedEvalMode emode, const CeedVector u, CeedVector v);
-
-// *****************************************************************************
-CEED_INTERN int CeedOperatorCreate_Occa(CeedOperator op);
-
-// *****************************************************************************
-CEED_INTERN int CeedQFunctionCreate_Occa(CeedQFunction qf);
-
-// *****************************************************************************
-CEED_INTERN int CeedQFunctionContextCreate_Occa(CeedQFunctionContext ctx);
-
-// *****************************************************************************
-CEED_INTERN int CeedElemRestrictionCreate_Occa(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, const bool *orients,
-                                               const CeedInt8 *curl_orients, const CeedElemRestriction res);
-
-// *****************************************************************************
-CEED_INTERN int CeedVectorCreate_Occa(CeedInt n, CeedVector vec);
diff --git a/backends/occa/kernels/elem-restriction.cpp b/backends/occa/kernels/elem-restriction.cpp
deleted file mode 100644
index d252e1a670..0000000000
--- a/backends/occa/kernels/elem-restriction.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "./kernel-defines.hpp"
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - COMPONENT_COUNT            : CeedInt
-// - ELEMENT_SIZE               : CeedInt
-// - NODE_COUNT                 : CeedInt
-// - TILE_SIZE                  : int
-// - USES_INDICES               : bool
-// - STRIDE_TYPE                : ceed::occa::StrideType
-// - NODE_STRIDE                : Optional[CeedInt]
-// - COMPONENT_STRIDE           : Optional[CeedInt]
-// - ELEMENT_STRIDE             : Optional[CeedInt]
-// - UNSTRIDED_COMPONENT_STRIDE : Optional[CeedInt]
-
-const char *occa_elem_restriction_source = STRINGIFY_SOURCE(
-
-    @directive("#define PRINT_KERNEL_HASHES 0")
-
-            typedef CeedScalar *
-        QuadVector @dim(ELEMENT_SIZE, COMPONENT_COUNT, elementCount);
-
-    @kernel void applyRestriction(const CeedInt elementCount, const CeedInt *indices, CeedScalar *u, QuadVector v) {
-      @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        @directive("#if PRINT_KERNEL_HASHES")
-            // Print to see which kernel is being run
-            if (element == 0) {
-          printf("\n\napplyRestriction Kernel: " OKL_KERNEL_HASH "\n\n");
-        }
-        @directive("#endif")
-
-            @directive("#if USES_INDICES") for (int node = 0; node < ELEMENT_SIZE; ++node) {
-          const CeedInt index = indices[node + (element * ELEMENT_SIZE)];
-
-          for (int c = 0; c < COMPONENT_COUNT; ++c) {
-            v(node, c, element) = u[index + (c * UNSTRIDED_COMPONENT_STRIDE)];
-          }
-        }
-        @directive("#else") for (int node = 0; node < ELEMENT_SIZE; ++node) {
-          for (int c = 0; c < COMPONENT_COUNT; ++c) {
-            v(node, c, element) = u[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)];
-          }
-        }
-        @directive("#endif")
-      }
-    }
-
-    @directive("#if USES_INDICES")
-
-        @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets,
-                                               const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) {
-          @tile(TILE_SIZE, @outer, @inner) for (int n = 0; n < NODE_COUNT; ++n) {
-            @directive("#if PRINT_KERNEL_HASHES")
-                // Print to see which kernel is being run
-                if (n == 0) {
-              printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n");
-            }
-            @directive("#endif")
-
-                CeedScalar vComp[COMPONENT_COUNT];
-
-            // Prefetch index information
-            const CeedInt vIndex      = quadIndices[n];
-            const CeedInt offsetStart = dofOffsets[n];
-            const CeedInt offsetEnd   = dofOffsets[n + 1];
-
-            for (int c = 0; c < COMPONENT_COUNT; ++c) {
-              vComp[c] = 0;
-            }
-
-            // Aggregate by component
-            for (CeedInt i = offsetStart; i < offsetEnd; ++i) {
-              const CeedInt index = dofIndices[i];
-
-              const int node    = (index % ELEMENT_SIZE);
-              const int element = (index / ELEMENT_SIZE);
-
-              for (int c = 0; c < COMPONENT_COUNT; ++c) {
-                vComp[c] += u(node, c, element);
-              }
-            }
-
-            // Update dofs by component
-            for (int c = 0; c < COMPONENT_COUNT; ++c) {
-              v[vIndex + (c * UNSTRIDED_COMPONENT_STRIDE)] += vComp[c];
-            }
-          }
-        }
-
-    @directive("#else")  // USES_INDICES = false
-
-    @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets,
-                                           const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) {
-      @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        @directive("#if PRINT_KERNEL_HASHES")
-            // Print to see which kernel is being run
-            if (element == 0) {
-          printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n");
-        }
-        @directive("#endif")
-
-            for (int node = 0; node < ELEMENT_SIZE; ++node) {
-          for (int c = 0; c < COMPONENT_COUNT; ++c) {
-            v[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)] += u(node, c, element);
-          }
-        }
-      }
-    }
-
-    @directive("#endif")  // USES_INDICES
-
-);
diff --git a/backends/occa/kernels/elem-restriction.hpp b/backends/occa/kernels/elem-restriction.hpp
deleted file mode 100644
index c2989dfbc2..0000000000
--- a/backends/occa/kernels/elem-restriction.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_ELEMRESTRICTION_HEADER
-#define CEED_OCCA_KERNELS_ELEMRESTRICTION_HEADER
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - COMPONENT_COUNT            : CeedInt
-// - ELEMENT_SIZE               : CeedInt
-// - NODE_COUNT                 : CeedInt
-// - TILE_SIZE                  : int
-// - USES_INDICES               : bool
-// - STRIDE_TYPE                : ceed::occa::StrideType
-// - NODE_STRIDE                : Optional[CeedInt]
-// - COMPONENT_STRIDE           : Optional[CeedInt]
-// - ELEMENT_STRIDE             : Optional[CeedInt]
-// - UNSTRIDED_COMPONENT_STRIDE : Optional[CeedInt]
-
-extern const char *occa_elem_restriction_source;
-
-#endif
diff --git a/backends/occa/kernels/kernel-defines.hpp b/backends/occa/kernels/kernel-defines.hpp
deleted file mode 100644
index 8e66664b64..0000000000
--- a/backends/occa/kernels/kernel-defines.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_KERNELDEFINES_HEADER
-#define CEED_OCCA_KERNELS_KERNELDEFINES_HEADER
-
-#define STRINGIFY_SOURCE(...) #__VA_ARGS__
-
-#endif
diff --git a/backends/occa/kernels/set-value.cpp b/backends/occa/kernels/set-value.cpp
deleted file mode 100644
index 87efbc6163..0000000000
--- a/backends/occa/kernels/set-value.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "./kernel-defines.hpp"
-
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - BLOCK_SIZE : CeedInt
-
-const char *occa_set_value_source = STRINGIFY_SOURCE(
-
-    @kernel void setValue(CeedScalar *ptr, const CeedScalar value, const CeedInt count) {
-      @tile(BLOCK_SIZE, @outer, @inner) for (CeedInt i = 0; i < count; ++i) {
-        ptr[i] = value;
-      }
-    });
diff --git a/backends/occa/kernels/set-value.hpp b/backends/occa/kernels/set-value.hpp
deleted file mode 100644
index c4173b2342..0000000000
--- a/backends/occa/kernels/set-value.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_SETVALUE_HEADER
-#define CEED_OCCA_KERNELS_SETVALUE_HEADER
-
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - BLOCK_SIZE : CeedInt
-
-extern const char *occa_set_value_source;
-
-#endif
\ No newline at end of file
diff --git a/backends/occa/kernels/simplex-basis.hpp b/backends/occa/kernels/simplex-basis.hpp
deleted file mode 100644
index b6e2d12cfe..0000000000
--- a/backends/occa/kernels/simplex-basis.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_SIMPLEXBASIS_HEADER
-#define CEED_OCCA_KERNELS_SIMPLEXBASIS_HEADER
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - DIM                  : CeedInt
-// - Q                    : CeedInt
-// - P                    : CeedInt
-// - MAX_PQ               : CeedInt
-// - BASIS_COMPONENT_COUNT: CeedInt
-// - ELEMENTS_PER_BLOCK   : CeedInt
-// - TRANSPOSE            : bool
-
-extern const char *occa_simplex_basis_cpu_function_source;
-extern const char *occa_simplex_basis_cpu_kernel_source;
-
-extern const char *occa_simplex_basis_gpu_source;
-
-#endif
diff --git a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
deleted file mode 100644
index 4b78dbf621..0000000000
--- a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../kernel-defines.hpp"
-
-const char *occa_simplex_basis_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define SIMPLEX_FUNCTION(FUNCTION_NAME) simplex_ ## DIM ## d_ ## FUNCTION_NAME ## _Q ## Q ## _P ## P")
-
-        inline void SIMPLEX_FUNCTION(interpElement)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) {
-          for (int q = 0; q < Q; ++q) {
-            CeedScalar v = 0;
-            for (int p = 0; p < P; ++p) {
-              v += B(p, q) * Ue[p];
-            }
-            Ve[q] = v;
-          }
-        }
-
-    inline void SIMPLEX_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) {
-      for (int p = 0; p < P; ++p) {
-        CeedScalar v = 0;
-        for (int q = 0; q < Q; ++q) {
-          v += B(p, q) * Ue[q];
-        }
-        Ve[p] = v;
-      }
-    }
-
-    inline void SIMPLEX_FUNCTION(gradElement)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve, ) {
-      for (int q = 0; q < Q; ++q) {
-        CeedScalar v[DIM];
-        for (int dim = 0; dim < DIM; ++dim) {
-          v[dim] = 0;
-        }
-
-        for (int p = 0; p < P; ++p) {
-          const CeedScalar u = Ue[p];
-          for (int dim = 0; dim < DIM; ++dim) {
-            v[dim] += Bx(p, q, dim) * u;
-          }
-        }
-
-        for (int dim = 0; dim < DIM; ++dim) {
-          Ve[dim * Q + q] = v[dim];
-        }
-      }
-    }
-
-    inline void SIMPLEX_FUNCTION(gradElementTranspose)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve) {
-      for (int p = 0; p < P; ++p) {
-        CeedScalar v = 0;
-        for (int dim = 0; dim < DIM; ++dim) {
-          for (int q = 0; q < Q; ++q) {
-            v += Bx(p, q, dim) * Ue[dim * Q + q];
-          }
-        }
-        Ve[p] = v;
-      }
-    }
-
-    inline void SIMPLEX_FUNCTION(weightElement)(const CeedScalar *qWeights, CeedScalar *We) {
-      for (int q = 0; q < Q; ++q) {
-        We[q] = qWeights[q];
-      }
-    }
-
-);
-
-const char *occa_simplex_basis_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            SIMPLEX_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            SIMPLEX_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount)       = U;
-            CeedScalar       *_Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = V;
-
-            CeedScalar Ve[DIM][Q];
-            for (int dim = 0; dim < DIM; ++dim) {
-              for (int q = 0; q < Q; ++q) {
-                Ve[dim][q] = _Ve(q, element, component, dim);
-              }
-            }
-
-            SIMPLEX_FUNCTION(gradElement)(Bx, &Ue(0, component, element), (CeedScalar *)Ve);
-
-            for (int dim = 0; dim < DIM; ++dim) {
-              for (int q = 0; q < Q; ++q) {
-                _Ve(q, element, component, dim) = Ve[dim][q];
-              }
-            }
-          } else {
-            const CeedScalar *_Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = U;
-            CeedScalar       *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount)       = V;
-
-            CeedScalar Ue[DIM][Q];
-            for (int dim = 0; dim < DIM; ++dim) {
-              for (int q = 0; q < Q; ++q) {
-                Ue[dim][q] = _Ue(q, element, component, dim);
-              }
-            }
-
-            SIMPLEX_FUNCTION(gradElementTranspose)(Bx, (CeedScalar *)Ue, &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights, CeedScalar *W @dim(Q, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        SIMPLEX_FUNCTION(weightElement)(qWeights, &W(0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
deleted file mode 100644
index ab0f2d8fd8..0000000000
--- a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../kernel-defines.hpp"
-
-const char *occa_simplex_basis_gpu_source = STRINGIFY_SOURCE(
-
-    @directive("#if TRANSPOSE") typedef CeedScalar * dofArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM);
-    typedef CeedScalar * quadArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM);
-    @directive("#else") typedef CeedScalar * dofArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM);
-    typedef CeedScalar * quadArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); @directive("#endif")
-
-                                                                                          typedef CeedScalar *
-                                                                                      quadToDof @dim(P, Q);
-    typedef CeedScalar * dQuadToDof @dim(P, Q, DIM); typedef CeedScalar * elementWeightArray @dim(Q, elementCount);
-
-    @kernel void interp(const CeedInt elementCount, const quadToDof B, const dofArray U, quadArray V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar s_B[P * Q] @dim(P, Q);
-
-        // Store weights in shared memory
-        for (int i = 0; i < MAX_PQ; ++i; @inner) {
-          for (int j = i; j < (P * Q); j += MAX_PQ) {
-            s_B[j] = B[j];
-          }
-        }
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) {
-          for (int i = 0; i < MAX_PQ; ++i; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              // Element operation
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                if (!TRANSPOSE) {
-                  const int q = i;
-                  if (q < Q) {
-                    CeedScalar v = 0;
-                    for (int p = 0; p < P; ++p) {
-                      v += s_B(p, q) * U(p, component, element, 0);
-                    }
-                    V(q, element, component, 0) = v;
-                  }
-                } else {
-                  const int p = i;
-                  if (p < P) {
-                    CeedScalar v = 0;
-                    for (int q = 0; q < Q; ++q) {
-                      v += s_B(p, q) * U(q, element, component, 0);
-                    }
-                    V(p, component, element, 0) = v;
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const dQuadToDof Bx, const dofArray U, quadArray V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar s_Bx[Q * P * DIM] @dim(P, Q, DIM);
-
-        // Store weights in shared memory
-        for (int i = 0; i < MAX_PQ; ++i; @inner) {
-          for (int j = i; j < (P * Q * DIM); j += MAX_PQ) {
-            s_Bx[j] = Bx[j];
-          }
-        }
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) {
-          for (int i = 0; i < MAX_PQ; ++i; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              // Element operation
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                if (!TRANSPOSE) {
-                  const int q = i;
-                  if (q < Q) {
-                    CeedScalar v[DIM];
-                    for (int dim = 0; dim < DIM; ++dim) {
-                      v[dim] = 0;
-                    }
-
-                    for (int p = 0; p < P; ++p) {
-                      const CeedScalar u = U(p, component, element, 0);
-                      for (int dim = 0; dim < DIM; ++dim) {
-                        v[dim] += s_Bx(p, q, dim) * u;
-                      }
-                    }
-
-                    for (int dim = 0; dim < DIM; ++dim) {
-                      V(q, element, component, dim) = v[dim];
-                    }
-                  }
-                } else {
-                  const int p = i;
-                  if (p < P) {
-                    CeedScalar v = 0;
-                    for (int dim = 0; dim < DIM; ++dim) {
-                      for (int q = 0; q < Q; ++q) {
-                        v += s_Bx(p, q, dim) * U(q, element, component, dim);
-                      }
-                    }
-                    V(p, component, element, 0) = v;
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights, elementWeightArray W) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar s_qWeights[Q];
-
-        for (int q = 0; q < Q; ++q; @inner) {
-          s_qWeights[q] = qWeights[q];
-        }
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) {
-          const int element = elementOffset + localElement;
-          if (element < elementCount) {
-            for (int q = 0; q < Q; ++q; @inner) {
-              W(q, element) = s_qWeights[q];
-            }
-          }
-        }
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis.hpp b/backends/occa/kernels/tensor-basis.hpp
deleted file mode 100644
index afffd661e8..0000000000
--- a/backends/occa/kernels/tensor-basis.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_TENSORBASIS_HEADER
-#define CEED_OCCA_KERNELS_TENSORBASIS_HEADER
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - Q1D                  : CeedInt
-// - P1D                  : CeedInt
-// - BASIS_COMPONENT_COUNT: CeedInt
-// - ELEMENTS_PER_BLOCK   : CeedInt
-// - SHARED_BUFFER_SIZE   : CeedInt
-// - TRANSPOSE            : bool
-
-extern const char *occa_tensor_basis_1d_cpu_function_source;
-extern const char *occa_tensor_basis_1d_cpu_kernel_source;
-
-extern const char *occa_tensor_basis_2d_cpu_function_source;
-extern const char *occa_tensor_basis_2d_cpu_kernel_source;
-
-extern const char *occa_tensor_basis_3d_cpu_function_source;
-extern const char *occa_tensor_basis_3d_cpu_kernel_source;
-
-extern const char *occa_tensor_basis_1d_gpu_source;
-extern const char *occa_tensor_basis_2d_gpu_source;
-extern const char *occa_tensor_basis_3d_gpu_source;
-
-#endif
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
deleted file mode 100644
index 5af734984f..0000000000
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_1d_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_1d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D")
-
-        inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) {
-          for (int q = 0; q < Q1D; ++q) {
-            CeedScalar Vq = 0;
-            for (int p = 0; p < P1D; ++p) {
-              Vq += B(p, q) * Ue[p];
-            }
-            Ve[q] = Vq;
-          }
-        }
-
-    inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) {
-      for (int p = 0; p < P1D; ++p) {
-        CeedScalar Vp = 0;
-        for (int q = 0; q < Q1D; ++q) {
-          Vp += B(p, q) * Ue[q];
-        }
-        Ve[p] = Vp;
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue,
-                                             CeedScalar *Ve) {
-      for (int q = 0; q < Q1D; ++q) {
-        CeedScalar Vq = 0;
-        for (int p = 0; p < P1D; ++p) {
-          Vq += Bx(p, q) * Ue[p];
-        }
-        Ve[q] = Vq;
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue,
-                                                      CeedScalar *Ve) {
-      for (int p = 0; p < P1D; ++p) {
-        CeedScalar Vp = 0;
-        for (int q = 0; q < Q1D; ++q) {
-          Vp += Bx(p, q) * Ue[q];
-        }
-        Ve[p] = Vp;
-      }
-    }
-
-    inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We) {
-      for (int q = 0; q < Q1D; ++q) {
-        We[q] = qWeights1D[q];
-      }
-    }
-
-);
-
-const char *occa_tensor_basis_1d_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, component, element), &Ve(0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(gradElementTranspose)(B, Bx, &Ue(0, element, component), &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
deleted file mode 100644
index 143025f7ba..0000000000
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_2d_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_2d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D")
-
-        inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D),
-                                                   CeedScalar *Ve @dim(Q1D, Q1D)) {
-          for (int qy = 0; qy < Q1D; ++qy) {
-            for (int qx = 0; qx < Q1D; ++qx) {
-              Ve(qx, qy) = 0;
-            }
-          }
-
-          for (int py = 0; py < P1D; ++py) {
-            CeedScalar V_x[Q1D];
-            for (int qx = 0; qx < Q1D; ++qx) {
-              V_x[qx] = 0;
-            }
-
-            for (int px = 0; px < P1D; ++px) {
-              const CeedScalar Up = Ue(px, py);
-              for (int qx = 0; qx < Q1D; ++qx) {
-                V_x[qx] += B(px, qx) * Up;
-              }
-            }
-
-            for (int qy = 0; qy < Q1D; ++qy) {
-              const CeedScalar w = B(py, qy);
-              for (int qx = 0; qx < Q1D; ++qx) {
-                Ve(qx, qy) += w * V_x[qx];
-              }
-            }
-          }
-        }
-
-    inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D),
-                                                        CeedScalar *Ve @dim(P1D, P1D)) {
-      for (int py = 0; py < P1D; ++py) {
-        for (int px = 0; px < P1D; ++px) {
-          Ve(px, py) = 0;
-        }
-      }
-
-      for (int qy = 0; qy < Q1D; ++qy) {
-        CeedScalar V_x[P1D];
-        for (int py = 0; py < P1D; ++py) {
-          V_x[py] = 0;
-        }
-
-        for (int qx = 0; qx < Q1D; ++qx) {
-          const CeedScalar Up = Ue(qx, qy);
-          for (int px = 0; px < P1D; ++px) {
-            V_x[px] += B(px, qx) * Up;
-          }
-        }
-
-        for (int py = 0; py < P1D; ++py) {
-          const CeedScalar w = B(py, qy);
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py) += w * V_x[px];
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                             const CeedScalar *Ue @dim(P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D), CeedScalar *Ve_y @dim(Q1D, Q1D)) {
-      CeedScalar grad[Q1D][Q1D][2];
-      for (int qy = 0; qy < Q1D; ++qy) {
-        for (int qx = 0; qx < Q1D; ++qx) {
-          grad[qy][qx][0] = 0;
-          grad[qy][qx][1] = 0;
-        }
-      }
-
-      for (int py = 0; py < P1D; ++py) {
-        CeedScalar gradX[Q1D][2];
-        for (int qx = 0; qx < Q1D; ++qx) {
-          gradX[qx][0] = 0;
-          gradX[qx][1] = 0;
-        }
-
-        for (int px = 0; px < P1D; ++px) {
-          const CeedScalar Up = Ue(px, py);
-          for (int qx = 0; qx < Q1D; ++qx) {
-            gradX[qx][0] += Up * B(px, qx);
-            gradX[qx][1] += Up * Bx(px, qx);
-          }
-        }
-
-        for (int qy = 0; qy < Q1D; ++qy) {
-          const CeedScalar wy  = B(py, qy);
-          const CeedScalar wDy = Bx(py, qy);
-          for (int qx = 0; qx < Q1D; ++qx) {
-            const CeedScalar wx  = gradX[qx][0];
-            const CeedScalar wDx = gradX[qx][1];
-            grad[qy][qx][0] += wDx * wy;
-            grad[qy][qx][1] += wx * wDy;
-          }
-        }
-      }
-      for (int qy = 0; qy < Q1D; ++qy) {
-        for (int qx = 0; qx < Q1D; ++qx) {
-          Ve_x(qx, qy) = grad[qy][qx][0];
-          Ve_y(qx, qy) = grad[qy][qx][1];
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                                      const CeedScalar *Ue_x @dim(Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D),
-                                                      CeedScalar *Ve @dim(P1D, P1D)) {
-      for (int py = 0; py < P1D; ++py) {
-        for (int px = 0; px < P1D; ++px) {
-          Ve(px, py) = 0.0;
-        }
-      }
-
-      for (int qy = 0; qy < Q1D; ++qy) {
-        CeedScalar gradX[P1D][2];
-        for (int px = 0; px < P1D; ++px) {
-          gradX[px][0] = 0;
-          gradX[px][1] = 0;
-        }
-
-        for (int qx = 0; qx < Q1D; ++qx) {
-          const CeedScalar Ux = Ue_x(qx, qy);
-          const CeedScalar Uy = Ue_y(qx, qy);
-          for (int px = 0; px < P1D; ++px) {
-            const CeedScalar wx  = B(px, qx);
-            const CeedScalar wDx = Bx(px, qx);
-            gradX[px][0] += Ux * wDx;
-            gradX[px][1] += Uy * wx;
-          }
-        }
-
-        for (int py = 0; py < P1D; ++py) {
-          const CeedScalar wy  = B(py, qy);
-          const CeedScalar wDy = Bx(py, qy);
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py) += ((gradX[px][0] * wy) + (gradX[px][1] * wDy));
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D)) {
-      for (int qy = 0; qy < Q1D; ++qy) {
-        const CeedScalar wy = qWeights1D[qy];
-        for (int qx = 0; qx < Q1D; ++qx) {
-          We(qx, qy) = qWeights1D[qx] * wy;
-        }
-      }
-    }
-
-);
-
-const char *occa_tensor_basis_2d_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, component, element), &Ve(0, 0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, element, component), &Ve(0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = V;
-
-            TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, 0, component, element), &Ve(0, 0, element, component, 0), &Ve(0, 0, element, component, 1));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = V;
-
-            TENSOR_FUNCTION(gradElementTranspose)
-            (B, Bx, &Ue(0, 0, element, component, 0), &Ue(0, 0, element, component, 1), &Ve(0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
deleted file mode 100644
index 45263bb635..0000000000
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_3d_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_3d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D")
-
-        inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D, P1D),
-                                                   CeedScalar *Ve @dim(Q1D, Q1D, Q1D)) {
-          for (int qz = 0; qz < Q1D; ++qz) {
-            for (int qy = 0; qy < Q1D; ++qy) {
-              for (int qx = 0; qx < Q1D; ++qx) {
-                Ve(qx, qy, qz) = 0;
-              }
-            }
-          }
-
-          for (int pz = 0; pz < P1D; ++pz) {
-            CeedScalar V_xy[Q1D][Q1D];
-            for (int qy = 0; qy < Q1D; ++qy) {
-              for (int qx = 0; qx < Q1D; ++qx) {
-                V_xy[qy][qx] = 0;
-              }
-            }
-
-            for (int py = 0; py < P1D; ++py) {
-              CeedScalar V_x[Q1D];
-              for (int qx = 0; qx < Q1D; ++qx) {
-                V_x[qx] = 0;
-              }
-
-              for (int px = 0; px < P1D; ++px) {
-                const CeedScalar Up = Ue(px, py, pz);
-                for (int qx = 0; qx < Q1D; ++qx) {
-                  V_x[qx] += B(px, qx) * Up;
-                }
-              }
-
-              for (int qy = 0; qy < Q1D; ++qy) {
-                const CeedScalar wy = B(py, qy);
-                for (int qx = 0; qx < Q1D; ++qx) {
-                  V_xy[qy][qx] += wy * V_x[qx];
-                }
-              }
-            }
-
-            for (int qz = 0; qz < Q1D; ++qz) {
-              const CeedScalar wz = B(pz, qz);
-              for (int qy = 0; qy < Q1D; ++qy) {
-                for (int qx = 0; qx < Q1D; ++qx) {
-                  Ve(qx, qy, qz) += wz * V_xy[qy][qx];
-                }
-              }
-            }
-          }
-        }
-
-    inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D, Q1D),
-                                                        CeedScalar *Ve @dim(P1D, P1D, P1D)) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py, pz) = 0;
-          }
-        }
-      }
-
-      for (int qz = 0; qz < Q1D; ++qz) {
-        CeedScalar V_xy[P1D][P1D];
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            V_xy[py][px] = 0;
-          }
-        }
-
-        for (int qy = 0; qy < Q1D; ++qy) {
-          CeedScalar V_x[P1D];
-          for (int px = 0; px < P1D; ++px) {
-            V_x[px] = 0;
-          }
-
-          for (int qx = 0; qx < Q1D; ++qx) {
-            const CeedScalar Uq = Ue(qx, qy, qz);
-            for (int px = 0; px < P1D; ++px) {
-              V_x[px] += B(px, qx) * Uq;
-            }
-          }
-
-          for (int py = 0; py < P1D; ++py) {
-            const CeedScalar wy = B(py, qy);
-            for (int px = 0; px < P1D; ++px) {
-              V_xy[py][px] += wy * V_x[px];
-            }
-          }
-        }
-
-        for (int pz = 0; pz < P1D; ++pz) {
-          const CeedScalar wz = B(pz, qz);
-          for (int py = 0; py < P1D; ++py) {
-            for (int px = 0; px < P1D; ++px) {
-              Ve(px, py, pz) += wz * V_xy[py][px];
-            }
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                             const CeedScalar *Ue @dim(P1D, P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D, Q1D),
-                                             CeedScalar *Ve_y @dim(Q1D, Q1D, Q1D), CeedScalar *Ve_z @dim(Q1D, Q1D, Q1D)) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        for (int qy = 0; qy < Q1D; ++qy) {
-          for (int qx = 0; qx < Q1D; ++qx) {
-            Ve_x(qx, qy, qz) = 0;
-            Ve_y(qx, qy, qz) = 0;
-            Ve_z(qx, qy, qz) = 0;
-          }
-        }
-      }
-
-      for (int pz = 0; pz < P1D; ++pz) {
-        CeedScalar gradXY[Q1D][Q1D][3];
-        for (int qy = 0; qy < Q1D; ++qy) {
-          for (int qx = 0; qx < Q1D; ++qx) {
-            gradXY[qy][qx][0] = 0;
-            gradXY[qy][qx][1] = 0;
-            gradXY[qy][qx][2] = 0;
-          }
-        }
-
-        for (int py = 0; py < P1D; ++py) {
-          CeedScalar gradX[Q1D][2];
-          for (int qx = 0; qx < Q1D; ++qx) {
-            gradX[qx][0] = 0;
-            gradX[qx][1] = 0;
-          }
-
-          for (int px = 0; px < P1D; ++px) {
-            const CeedScalar Up = Ue(px, py, pz);
-            for (int qx = 0; qx < Q1D; ++qx) {
-              gradX[qx][0] += Up * B(px, qx);
-              gradX[qx][1] += Up * Bx(px, qx);
-            }
-          }
-
-          for (int qy = 0; qy < Q1D; ++qy) {
-            const CeedScalar wy  = B(py, qy);
-            const CeedScalar wDy = Bx(py, qy);
-            for (int qx = 0; qx < Q1D; ++qx) {
-              const CeedScalar wx  = gradX[qx][0];
-              const CeedScalar wDx = gradX[qx][1];
-              gradXY[qy][qx][0] += wDx * wy;
-              gradXY[qy][qx][1] += wx * wDy;
-              gradXY[qy][qx][2] += wx * wy;
-            }
-          }
-        }
-
-        for (int qz = 0; qz < Q1D; ++qz) {
-          const CeedScalar wz  = B(pz, qz);
-          const CeedScalar wDz = Bx(pz, qz);
-          for (int qy = 0; qy < Q1D; ++qy) {
-            for (int qx = 0; qx < Q1D; ++qx) {
-              Ve_x(qx, qy, qz) += gradXY[qy][qx][0] * wz;
-              Ve_y(qx, qy, qz) += gradXY[qy][qx][1] * wz;
-              Ve_z(qx, qy, qz) += gradXY[qy][qx][2] * wDz;
-            }
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                                      const CeedScalar *Ue_x @dim(Q1D, Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D, Q1D),
-                                                      const CeedScalar *Ue_z @dim(Q1D, Q1D, Q1D), CeedScalar *Ve @dim(P1D, P1D, P1D)) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py, pz) = 0;
-          }
-        }
-      }
-
-      for (int qz = 0; qz < Q1D; ++qz) {
-        CeedScalar gradXY[P1D][P1D][3];
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            gradXY[py][px][0] = 0;
-            gradXY[py][px][1] = 0;
-            gradXY[py][px][2] = 0;
-          }
-        }
-
-        for (int qy = 0; qy < Q1D; ++qy) {
-          CeedScalar gradX[P1D][3];
-          for (int px = 0; px < P1D; ++px) {
-            gradX[px][0] = 0;
-            gradX[px][1] = 0;
-            gradX[px][2] = 0;
-          }
-
-          for (int qx = 0; qx < Q1D; ++qx) {
-            const CeedScalar Ux = Ue_x(qx, qy, qz);
-            const CeedScalar Uy = Ue_y(qx, qy, qz);
-            const CeedScalar Uz = Ue_z(qx, qy, qz);
-            for (int px = 0; px < P1D; ++px) {
-              const CeedScalar wx  = B(px, qx);
-              const CeedScalar wDx = Bx(px, qx);
-              gradX[px][0] += Ux * wDx;
-              gradX[px][1] += Uy * wx;
-              gradX[px][2] += Uz * wx;
-            }
-          }
-
-          for (int py = 0; py < P1D; ++py) {
-            const CeedScalar wy  = B(py, qy);
-            const CeedScalar wDy = Bx(py, qy);
-            for (int px = 0; px < P1D; ++px) {
-              gradXY[py][px][0] += gradX[px][0] * wy;
-              gradXY[py][px][1] += gradX[px][1] * wDy;
-              gradXY[py][px][2] += gradX[px][2] * wy;
-            }
-          }
-        }
-
-        for (int pz = 0; pz < P1D; ++pz) {
-          const CeedScalar wz  = B(pz, qz);
-          const CeedScalar wDz = Bx(pz, qz);
-          for (int py = 0; py < P1D; ++py) {
-            for (int px = 0; px < P1D; ++px) {
-              Ve(px, py, pz) += ((gradXY[py][px][0] * wz) + (gradXY[py][px][1] * wz) + (gradXY[py][px][2] * wDz));
-            }
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D, Q1D)) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        const CeedScalar wz = qWeights1D[qz];
-        for (int qy = 0; qy < Q1D; ++qy) {
-          const CeedScalar wy = qWeights1D[qy];
-          for (int qx = 0; qx < Q1D; ++qx) {
-            We(qx, qy, qz) = qWeights1D[qx] * wy * wz;
-          }
-        }
-      }
-    }
-
-);
-
-const char *occa_tensor_basis_3d_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, 0, element, component), &Ve(0, 0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = V;
-
-            TENSOR_FUNCTION(gradElement)
-            (B, Bx, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component, 0), &Ve(0, 0, 0, element, component, 1),
-             &Ve(0, 0, 0, element, component, 2));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = V;
-
-            TENSOR_FUNCTION(gradElementTranspose)
-            (B, Bx, &Ue(0, 0, 0, element, component, 0), &Ue(0, 0, 0, element, component, 1), &Ue(0, 0, 0, element, component, 2),
-             &Ve(0, 0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, Q1D, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, 0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
deleted file mode 100644
index 408140b723..0000000000
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_1d_gpu_source = STRINGIFY_SOURCE(
-
-    typedef CeedScalar * dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount);
-    typedef const CeedScalar *const_dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount);
-
-    typedef CeedScalar * quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT);
-    typedef const CeedScalar *const_quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT);
-
-    typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, ELEMENTS_PER_BLOCK); typedef const CeedScalar *quadToDof @dim(P1D, Q1D);
-    typedef CeedScalar * elementWeightArray @dim(Q1D, elementCount);
-
-    //---[ Utility Methods ]----------------
-    inline void readDofs(const int element, const int localElement, const int component, const int p, const_dofArray U,
-                         sharedBufferArray sharedBuffer) {
-      // Zero out extra entries
-      sharedBuffer(p, localElement) = ((p < P1D) ? U(p, component, element) : 0.0);
-    }
-
-    inline void writeDofs(const int element, const int component, const int p, const CeedScalar Vp, dofArray V) {
-      if (p < P1D) {
-        V(p, component, element) = Vp;
-      }
-    }
-
-    inline void readQuads(const int elementCount, const int element, const int localElement, const int component, const int q, const_quadArray U,
-                          sharedBufferArray sharedBuffer) { sharedBuffer(q, localElement) = U(q, element, component); }
-
-    inline void writeQuads(const int elementCount, const int element, const int component, const int q, const CeedScalar Vq, quadArray V) {
-      V(q, element, component) = Vq;
-    }
-
-    inline void contractX(const int q, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) {
-      V = 0.0;
-      for (int p = 0; p < P1D; ++p) {
-        V += B(p, q) * sharedBuffer(p, localElement);
-      }
-    }
-
-    inline void contractTransposeX(const int p, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) {
-      V = 0.0;
-      for (int q = 0; q < Q1D; ++q) {
-        V += B(p, q) * sharedBuffer(q, localElement);
-      }
-    }
-
-    //---[ Kernels ]------------------------
-    @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          for (int q = 0; q < Q1D; ++q; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                CeedScalar r;
-                if (!TRANSPOSE) {
-                  readDofs(element, localElement, component, q, U, sharedBuffer);
-                  contractX(q, localElement, sharedBuffer, B, r);
-                  writeQuads(elementCount, element, component, q, r, V);
-                } else {
-                  readQuads(elementCount, element, localElement, component, q, U, sharedBuffer);
-                  contractTransposeX(q, localElement, sharedBuffer, B, r);
-                  writeDofs(element, component, q, r, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          for (int q = 0; q < Q1D; ++q; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                CeedScalar r;
-                if (!TRANSPOSE) {
-                  readDofs(element, localElement, component, q, U, sharedBuffer);
-                  contractX(q, localElement, sharedBuffer, Bx, r);
-                  writeQuads(elementCount, element, component, q, r, V);
-                } else {
-                  readQuads(elementCount, element, localElement, component, q, U, sharedBuffer);
-                  contractTransposeX(q, localElement, sharedBuffer, Bx, r);
-                  writeDofs(element, component, q, r, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) {
-          for (int q = 0; q < Q1D; ++q; @inner) {
-            W(q, element) = qWeights1D[q];
-          }
-        }
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
deleted file mode 100644
index c080336f40..0000000000
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_2d_gpu_source = STRINGIFY_SOURCE(
-
-    typedef CeedScalar * dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-    typedef const CeedScalar *const_dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-
-    typedef CeedScalar * quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2);
-    typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2);
-
-    typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, ELEMENTS_PER_BLOCK); typedef const CeedScalar *quadToDof @dim(P1D, Q1D);
-    typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, elementCount);
-
-    //---[ Utility Methods ]----------------
-    inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) {
-      // Zero out extra entries
-      *Up = ((px < P1D) && (py < P1D) ? U(px, py, component, element) : 0.0);
-    }
-
-    inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar Vp, dofArray V) {
-      if ((px < P1D) && (py < P1D)) {
-        V(px, py, component, element) = Vp;
-      }
-    }
-
-    inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                          const_quadArray U, CeedScalar *Uq) { *Uq = U(qx, qy, element, component, dim); }
-
-    inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                           const CeedScalar Vq, quadArray V) { V(qx, qy, element, component, dim) = Vq; }
-
-    inline void contractX(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U,
-                          CeedScalar *V) {
-      sharedBuffer(qx, qy, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int p = 0; p < P1D; ++p) {
-        *V += B(p, qx) * sharedBuffer(p, qy, localElement);
-      }
-      @barrier();
-    }
-
-    inline void contractY(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U,
-                          CeedScalar *V) {
-      sharedBuffer(qx, qy, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int p = 0; p < P1D; ++p) {
-        *V += B(p, qy) * sharedBuffer(qx, p, localElement);
-      }
-      @barrier();
-    }
-
-    inline void contractTransposeX(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B,
-                                   const CeedScalar U, CeedScalar *V) {
-      sharedBuffer(px, py, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int q = 0; q < Q1D; ++q) {
-        *V += B(px, q) * sharedBuffer(q, py, localElement);
-      }
-      @barrier();
-    }
-
-    inline void contractTransposeY(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B,
-                                   const CeedScalar U, CeedScalar *V) {
-      sharedBuffer(px, py, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int q = 0; q < Q1D; ++q) {
-        *V += B(py, q) * sharedBuffer(px, q, localElement);
-      }
-      @barrier();
-    }
-
-    //---[ Kernels ]------------------------
-    @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          const int element = elementOffset + localElement;
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                  CeedScalar r1, r2;
-                  if (!TRANSPOSE) {
-                    readDofs(element, component, qx, qy, U, &r1);
-                    contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractY(qx, qy, localElement, sharedBuffer, B, r2, &r1);
-                    writeQuads(elementCount, element, component, qx, qy, 0, r1, V);
-                  } else {
-                    readQuads(elementCount, element, component, qx, qy, 0, U, &r1);
-                    contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1);
-                    writeDofs(element, component, qx, qy, r1, V);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          const int element = elementOffset + localElement;
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                  CeedScalar r1, r2, r3;
-                  if (!TRANSPOSE) {
-                    readDofs(element, component, qx, qy, U, &r1);
-                    contractX(qx, qy, localElement, sharedBuffer, Bx, r1, &r2);
-                    contractY(qx, qy, localElement, sharedBuffer, B, r2, &r3);
-                    writeQuads(elementCount, element, component, qx, qy, 0, r3, V);
-                    contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractY(qx, qy, localElement, sharedBuffer, Bx, r2, &r3);
-                    writeQuads(elementCount, element, component, qx, qy, 1, r3, V);
-                  } else {
-                    readQuads(elementCount, element, component, qx, qy, 0, U, &r1);
-                    contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractTransposeX(qx, qy, localElement, sharedBuffer, Bx, r2, &r3);
-                    readQuads(elementCount, element, component, qx, qy, 1, U, &r1);
-                    contractTransposeY(qx, qy, localElement, sharedBuffer, Bx, r1, &r2);
-                    contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1);
-                    writeDofs(element, component, qx, qy, r1 + r3, V);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              W(qx, qy, element) = qWeights1D[qx] * qWeights1D[qy];
-            }
-          }
-        }
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
deleted file mode 100644
index 6d0b5f631c..0000000000
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_3d_gpu_source = STRINGIFY_SOURCE(
-
-    typedef CeedScalar * dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-    typedef const CeedScalar *const_dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-
-    typedef CeedScalar * quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3);
-    typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3);
-
-    typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, BASIS_COMPONENT_COUNT); typedef const CeedScalar *quadToDof @dim(P1D, Q1D);
-    typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, Q1D, elementCount);
-
-    //---[ Utility Methods ]----------------
-    inline void add(const CeedScalar *U, CeedScalar *V) {
-      for (int q = 0; q < Q1D; q++) {
-        V[q] += U[q];
-      }
-    }
-
-    inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) {
-      // Zero out extra entries
-      for (int pz = 0; pz < P1D; ++pz) {
-        Up[pz] = ((px < P1D) && (py < P1D) ? U(px, py, pz, component, element) : 0.0);
-      }
-      for (int q = P1D; q < Q1D; ++q) {
-        Up[q] = 0.0;
-      }
-    }
-
-    inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar *Vp, dofArray V) {
-      if ((px < P1D) && (py < P1D)) {
-        for (int pz = 0; pz < P1D; ++pz) {
-          V(px, py, pz, component, element) = Vp[pz];
-        }
-      }
-    }
-
-    inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                          const_quadArray U, CeedScalar *Uq) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        Uq[qz] = U(qx, qy, qz, element, component, dim);
-      }
-    }
-
-    inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                           const CeedScalar *Vq, quadArray V) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        V(qx, qy, qz, element, component, dim) = Vq[qz];
-      }
-    }
-
-    inline void contractX(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq,
-                          CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(qx, qy, component) = Uq[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        for (int p = 0; p < P1D; ++p) {
-          Vp[pz] += B(p, qx) * sharedBuffer(p, qy, component);
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractY(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq,
-                          CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(qx, qy, component) = Uq[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        for (int p = 0; p < P1D; ++p) {
-          Vp[pz] += B(p, qy) * sharedBuffer(qx, p, component);
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractZ(const int qx, const int qy, quadToDof B, const CeedScalar *Up, CeedScalar *Vq) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        Vq[qz] = 0.0;
-        for (int p = 0; p < P1D; ++p) {
-          Vq[qz] += B(p, qz) * Up[p];
-        }
-      }
-    }
-
-    inline void contractTransposeX(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up,
-                                   CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(px, py, component) = Up[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        if (px < P1D) {
-          for (int qx = 0; qx < Q1D; ++qx) {
-            Vp[pz] += B(px, qx) * sharedBuffer(qx, py, component);
-          }
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractTransposeY(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up,
-                                   CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(px, py, component) = Up[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        if (py < P1D) {
-          for (int qy = 0; qy < Q1D; ++qy) {
-            Vp[pz] += B(py, qy) * sharedBuffer(px, qy, component);
-          }
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractTransposeZ(const int px, const int py, quadToDof B, const CeedScalar *Uq, CeedScalar *Vq) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        Vq[pz] = 0.0;
-        for (int qz = 0; qz < Q1D; ++qz) {
-          Vq[pz] += B(pz, qz) * Uq[qz];
-        }
-      }
-    }
-
-    //---[ Kernels ]------------------------
-    @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT];
-
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                CeedScalar r1[MAX_PQ], r2[MAX_PQ];
-                for (int q = 0; q < Q1D; ++q) {
-                  r1[q] = 0.0;
-                  r2[q] = 0.0;
-                }
-
-                if (!TRANSPOSE) {
-                  readDofs(element, component, qx, qy, U, r1);
-                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, B, r2, r1);
-                  contractZ(qx, qy, B, r1, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 0, r2, V);
-                } else {
-                  readQuads(elementCount, element, component, qx, qy, 0, U, r1);
-                  contractTransposeZ(qx, qy, B, r1, r2);
-                  contractTransposeY(qx, qy, component, sharedBuffer, B, r2, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  writeDofs(element, component, qx, qy, r2, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT];
-
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                CeedScalar r1[MAX_PQ], r2[MAX_PQ], r3[MAX_PQ];
-
-                if (!TRANSPOSE) {
-                  readDofs(element, component, qx, qy, U, r1);
-                  // Dx
-                  contractX(qx, qy, component, sharedBuffer, Bx, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, B, r2, r3);
-                  contractZ(qx, qy, B, r3, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 0, r2, V);
-                  // Dy
-                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, Bx, r2, r3);
-                  contractZ(qx, qy, B, r3, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 1, r2, V);
-                  // Dz
-                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, B, r2, r3);
-                  contractZ(qx, qy, Bx, r3, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 2, r2, V);
-                } else {
-                  // Dx
-                  readQuads(elementCount, element, component, qx, qy, 0, U, r1);
-                  contractTransposeZ(qx, qy, B, r1, r3);
-                  contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, Bx, r1, r2);
-                  // Dy
-                  readQuads(elementCount, element, component, qx, qy, 1, U, r1);
-                  contractTransposeZ(qx, qy, B, r1, r3);
-                  contractTransposeY(qx, qy, component, sharedBuffer, Bx, r3, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3);
-                  add(r3, r2);
-                  // Dz
-                  readQuads(elementCount, element, component, qx, qy, 2, U, r1);
-                  contractTransposeZ(qx, qy, Bx, r1, r3);
-                  contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3);
-                  add(r3, r2);
-                  writeDofs(element, component, qx, qy, r2, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int qz = 0; qz < Q1D; ++qz; @inner) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx) {
-              if (element < elementCount) {
-                W(qx, qy, qz, element) = qWeights1D[qx] * qWeights1D[qy] * qWeights1D[qz];
-              }
-            }
-          }
-        }
-      }
-    }
-
-);
diff --git a/doc/sphinx/source/intro.md b/doc/sphinx/source/intro.md
index 3c0d04e1ef..574672b31d 100644
--- a/doc/sphinx/source/intro.md
+++ b/doc/sphinx/source/intro.md
@@ -22,7 +22,7 @@ Furthermore, software packages that provide high-performance implementations hav
 libCEED's purely algebraic interface can unobtrusively be integrated in new and legacy software to provide performance portable interfaces.
 While libCEED's focus is on high-order finite elements, the approach is algebraic and thus applicable to other discretizations in factored form.
 libCEED's role, as a lightweight portable library that allows a wide variety of applications to share highly optimized discretization kernels, is illustrated in {numref}`fig-libCEED-backends`, where a non-exhaustive list of specialized implementations (backends) is provided.
-libCEED provides a low-level Application Programming Interface (API) for user codes so that applications with their own discretization infrastructure (e.g., those in [PETSc](https://www.mcs.anl.gov/petsc/), [MFEM](https://mfem.org/) and [Nek5000](https://nek5000.mcs.anl.gov/)) can evaluate and use the core operations provided by libCEED. GPU implementations are available via pure [CUDA](https://developer.nvidia.com/about-cuda) and pure [HIP](https://rocmdocs.amd.com) as well as the [OCCA](http://github.com/libocca/occa) and [MAGMA](https://bitbucket.org/icl/magma) libraries.
+libCEED provides a low-level Application Programming Interface (API) for user codes so that applications with their own discretization infrastructure (e.g., those in [PETSc](https://www.mcs.anl.gov/petsc/), [MFEM](https://mfem.org/) and [Nek5000](https://nek5000.mcs.anl.gov/)) can evaluate and use the core operations provided by libCEED. GPU implementations are available via pure [CUDA](https://developer.nvidia.com/about-cuda) and pure [HIP](https://rocmdocs.amd.com) as well as the [MAGMA](https://bitbucket.org/icl/magma) library.
 CPU implementations are available via pure C and AVX intrinsics as well as the [LIBXSMM](http://github.com/hfp/libxsmm) library.
 libCEED provides a unified interface, so that users only need to write a single source code and can select the desired specialized implementation at run time. Moreover, each process or thread can instantiate an arbitrary number of backends.
 
diff --git a/doc/sphinx/source/libCEEDapi.md b/doc/sphinx/source/libCEEDapi.md
index b43871e422..66caa15688 100644
--- a/doc/sphinx/source/libCEEDapi.md
+++ b/doc/sphinx/source/libCEEDapi.md
@@ -259,7 +259,7 @@ If greater than 1, the caller must ensure that the number of quadrature points `
 This is often satisfied automatically due to the element size or by batching elements together to facilitate vectorization in other stages, and can always be ensured by padding.
 
 In addition to the function pointers (`setup` and `mass`), {ref}`CeedQFunction` constructors take a string representation specifying where the source for the implementation is found.
-This is used by backends that support Just-In-Time (JIT) compilation (i.e., CUDA and OCCA) to compile for coprocessors.
+This is used by backends that support Just-In-Time (JIT) compilation (i.e., CUDA and HIP) to compile for coprocessors.
 For full support across all backends, these {ref}`CeedQFunction` source files must only contain constructs mutually supported by C99, C++11, and CUDA.
 For example, explicit type casting of void pointers and explicit use of compatible arguments for {code}`math` library functions is required, and variable-length array (VLA) syntax for array reshaping is only available via libCEED's {code}`CEED_Q_VLA` macro.
 
diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md
index 61d0156730..cd6bfbd733 100644
--- a/doc/sphinx/source/libCEEDdev.md
+++ b/doc/sphinx/source/libCEEDdev.md
@@ -86,8 +86,6 @@ This kernel is compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC.
 The `/gpu/*/magma` backends delegate to the corresponding `/gpu/cuda/ref` and `/gpu/hip/ref` backends.
 These backends provide better performance for {ref}`CeedBasis` kernels but do not have the improvements from the `/gpu/*/gen` backends for {ref}`CeedOperator`.
 
-The `/*/*/occa` backends are an experimental feature and not part of any family.
-
 ## Internal Layouts
 
 Ceed backends are free to use any E-vector and Q-vector data layout (including never fully forming these vectors) so long as the backend passes the `t5**` series tests and all examples.
diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 0a4be0b959..682e76f13b 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -35,6 +35,10 @@ Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will
 
 - Add deal.II example with CEED BP suite.
 
+### Maintainability
+
+- OCCA backends were retired.
+
 (v0-12)=
 
 ## v0.12 (Oct 31, 2023)
diff --git a/examples/petsc/area.c b/examples/petsc/area.c
index 2c7e2fbe7a..1b146a4c21 100644
--- a/examples/petsc/area.c
+++ b/examples/petsc/area.c
@@ -128,7 +128,6 @@ int main(int argc, char **argv) {
 
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
       else vec_type = VECSTANDARD;
     }
diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index 92838c8537..a00ee650c8 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -83,7 +83,6 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
 
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
       else vec_type = VECSTANDARD;
     }
diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index 2541d351bc..5bb10f4bd7 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -406,7 +406,6 @@ int main(int argc, char **argv) {
 
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) default_vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) default_vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) default_vec_type = VECHIP;
       else default_vec_type = VECSTANDARD;
     }
diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c
index acd591d5e3..30489224b1 100644
--- a/examples/petsc/bpssphere.c
+++ b/examples/petsc/bpssphere.c
@@ -107,7 +107,6 @@ int main(int argc, char **argv) {
 
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
       else vec_type = VECSTANDARD;
     }
diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 787148797a..e4ba5aed4c 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -161,7 +161,6 @@ int main(int argc, char **argv) {
 
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
       else vec_type = VECSTANDARD;
     }
diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index 39bf1eb1ab..1bce6a318a 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -132,7 +132,6 @@ int main(int argc, char **argv) {
 
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
       else vec_type = VECSTANDARD;
     }
diff --git a/examples/python/tutorial-0-ceed.ipynb b/examples/python/tutorial-0-ceed.ipynb
index b1d712a552..801081154f 100644
--- a/examples/python/tutorial-0-ceed.ipynb
+++ b/examples/python/tutorial-0-ceed.ipynb
@@ -92,8 +92,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Similarly, if libCEED is built with GPU support, you can specify a GPU backend, e.g., `/gpu/occa` or `/gpu/cuda/gen`."
+    "Similarly, if libCEED is built with GPU support, you can specify a GPU backend, e.g., `/gpu/hip` or `/gpu/cuda/gen`."
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/tests/junit.py b/tests/junit.py
index 5b4e8319d7..f9ef51891d 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -107,9 +107,6 @@ def check_pre_skip(self, test: str, spec: TestSpec, resource: str, nproc: int) -
         Returns:
             Optional[str]: Skip reason, or `None` if test case should not be skipped
         """
-        if contains_any(resource, ['occa']) and startswith_any(
-                test, ['t4', 't5', 'ex', 'mfem', 'nek', 'petsc', 'fluids', 'solids']):
-            return 'OCCA mode not supported'
         if test.startswith('t318') and contains_any(resource, ['/gpu/cuda/ref']):
             return 'CUDA ref backend not supported'
         if test.startswith('t506') and contains_any(resource, ['/gpu/cuda/shared']):
@@ -130,9 +127,7 @@ def check_post_skip(self, test: str, spec: TestSpec, resource: str, stderr: str)
         Returns:
             Optional[str]: Skip reason, or `None` if unexpeced error
         """
-        if 'OCCA backend failed to use' in stderr:
-            return f'OCCA mode not supported'
-        elif 'Backend does not implement' in stderr:
+        if 'Backend does not implement' in stderr:
             return f'Backend does not implement'
         elif 'Can only provide HOST memory for this backend' in stderr:
             return f'Device memory not supported'

From cc2ec7ec82ea595f9373397304c8373f122cc9f8 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 8 Jan 2026 14:58:42 -0700
Subject: [PATCH 517/571] gallery - no need for weak symbols here

---
 gallery/ceed-gallery-weak.c | 23 -----------------------
 1 file changed, 23 deletions(-)
 delete mode 100644 gallery/ceed-gallery-weak.c

diff --git a/gallery/ceed-gallery-weak.c b/gallery/ceed-gallery-weak.c
deleted file mode 100644
index 8744f75d8f..0000000000
--- a/gallery/ceed-gallery-weak.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <ceed/backend.h>
-#include <ceed/types.h>
-
-// This function provides a debug target for weak symbols
-// LCOV_EXCL_START
-static int CeedQFunctionRegister_Weak(const char *name) {
-  CeedDebugEnv("** Weak Register: %s", name);
-  return CEED_ERROR_SUCCESS;
-}
-// LCOV_EXCL_STOP
-
-#define CEED_GALLERY_QFUNCTION(name)                \
-  CEED_INTERN int name(void) __attribute__((weak)); \
-  int             name(void) { return CeedQFunctionRegister_Weak(__func__); }
-#include "ceed-gallery-list.h"
-#undef CEED_GALLERY_QFUNCTION

From e31b7a9f017acbb99fc9680ebe1cb8561925e8f7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 13 Jan 2026 11:41:26 -0700
Subject: [PATCH 518/571] gen - use nD for pure CEED_EVAL_NONE AtPoints
 operator

---
 backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 8 ++++++--
 backends/hip-gen/ceed-hip-gen-operator-build.cpp   | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 1cfd9c97d7..79105413fa 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1211,19 +1211,23 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
                                                          &use_3d_slices));
     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
   }
-  if (max_dim == 0) max_dim = 1;
-  data->dim = max_dim;
   if (is_at_points) {
+    CeedInt                   coords_dim = 0;
     CeedElemRestriction_Cuda *rstr_data;
     CeedElemRestriction       rstr_points = NULL;
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
     data->points.indices = (CeedInt *)rstr_data->d_offsets;
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+    if (max_dim == 0) max_dim = coords_dim;
+    if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
   }
+  if (max_dim == 0) max_dim = 1;
+  data->dim = max_dim;
   if (is_at_points) use_3d_slices = false;
   if (Q_1d == 0) {
     if (is_at_points) Q_1d = max_num_points;
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 2380cbb83d..fc47a6cdd5 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1237,19 +1237,23 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
   }
-  if (max_dim == 0) max_dim = 1;
-  data->dim = max_dim;
   if (is_at_points) {
+    CeedInt                  coords_dim = 0;
     CeedElemRestriction_Hip *rstr_data;
     CeedElemRestriction      rstr_points = NULL;
 
     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
     data->points.indices = (CeedInt *)rstr_data->d_offsets;
     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+    if (max_dim == 0) max_dim = coords_dim;
+    if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
   }
+  if (max_dim == 0) max_dim = 1;
+  data->dim = max_dim;
   if (is_at_points) use_3d_slices = false;
   if (Q_1d == 0) {
     if (is_at_points) Q_1d = max_num_points;

From b0f67a9c1aeeb4d82b4724afaae1227ff4e81f15 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 14 Jan 2026 13:40:15 -0700
Subject: [PATCH 519/571] internal - add CeedObject to handle common
 functionality

---
 backends/ref/ceed-ref-basis.c     |   4 +
 include/ceed-impl.h               |  59 +++++++------
 include/ceed/backend.h            |   5 ++
 include/ceed/ceed.h               |   7 ++
 interface/ceed-basis.c            |  44 ++++++----
 interface/ceed-elemrestriction.c  |  67 +++++++--------
 interface/ceed-object.c           | 132 ++++++++++++++++++++++++++++++
 interface/ceed-operator.c         |  35 +++++---
 interface/ceed-preconditioning.c  |  12 ++-
 interface/ceed-qfunction.c        |  29 +++++--
 interface/ceed-qfunctioncontext.c |  31 +++++--
 interface/ceed-tensor.c           |  13 ++-
 interface/ceed-vector.c           |  46 ++++++++---
 interface/ceed.c                  |  47 ++++++++---
 14 files changed, 384 insertions(+), 147 deletions(-)
 create mode 100644 interface/ceed-object.c

diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index 21cbe2201f..d8eef6ce98 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -295,6 +295,7 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
@@ -317,6 +318,7 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
@@ -338,6 +340,7 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
@@ -359,6 +362,7 @@ int CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 2c2454abcf..434943a1b2 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -90,21 +90,28 @@ struct CeedWorkVectors_private {
   CeedVector *vecs;
 };
 
+typedef struct CeedObject_private {
+  Ceed ceed;
+  int (*ViewFunction)(CeedObject, FILE *);
+  int ref_count;
+} CeedObject_private;
+
 struct Ceed_private {
-  const char  *resource;
-  Ceed         delegate;
-  Ceed         parent;
-  ObjDelegate *obj_delegates;
-  int          obj_delegate_count;
-  Ceed         op_fallback_ceed;
-  char       **jit_source_roots;
-  char       **rust_source_roots;
-  CeedInt      num_rust_source_roots, max_rust_source_roots, num_rust_source_roots_readers;
-  CeedInt      num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers;
-  bool         cuda_compile_with_clang;
-  char       **jit_defines;
-  CeedInt      num_jit_defines, max_jit_defines, num_jit_defines_readers;
-  CeedInt      num_tabs; /* Viewing offset */
+  CeedObject_private obj;
+  const char        *resource;
+  Ceed               delegate;
+  Ceed               parent;
+  ObjDelegate       *obj_delegates;
+  int                obj_delegate_count;
+  Ceed               op_fallback_ceed;
+  char             **jit_source_roots;
+  char             **rust_source_roots;
+  CeedInt            num_rust_source_roots, max_rust_source_roots, num_rust_source_roots_readers;
+  CeedInt            num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers;
+  bool               cuda_compile_with_clang;
+  char             **jit_defines;
+  CeedInt            num_jit_defines, max_jit_defines, num_jit_defines_readers;
+  CeedInt            num_tabs; /* Viewing offset */
   int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *);
   int (*SetStream)(Ceed, void *);
   int (*GetPreferredMemType)(CeedMemType *);
@@ -126,7 +133,6 @@ struct Ceed_private {
   int (*OperatorCreate)(CeedOperator);
   int (*OperatorCreateAtPoints)(CeedOperator);
   int (*CompositeOperatorCreate)(CeedOperator);
-  int             ref_count;
   void           *data;
   bool            is_debug;
   bool            is_deterministic;
@@ -136,7 +142,7 @@ struct Ceed_private {
 };
 
 struct CeedVector_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*HasValidArray)(CeedVector, bool *);
   int (*HasBorrowedArrayOfType)(CeedVector, CeedMemType, bool *);
   int (*CopyStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedVector);
@@ -157,7 +163,6 @@ struct CeedVector_private {
   int (*PointwiseMult)(CeedVector, CeedVector, CeedVector);
   int (*Reciprocal)(CeedVector);
   int (*Destroy)(CeedVector);
-  int      ref_count;
   CeedSize length;
   uint64_t state;
   uint64_t num_readers;
@@ -166,7 +171,7 @@ struct CeedVector_private {
 };
 
 struct CeedElemRestriction_private {
-  Ceed                ceed;
+  CeedObject_private  obj;
   CeedElemRestriction rstr_base;
   int (*Apply)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
   int (*ApplyUnsigned)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
@@ -178,7 +183,6 @@ struct CeedElemRestriction_private {
   int (*GetOrientations)(CeedElemRestriction, CeedMemType, const bool **);
   int (*GetCurlOrientations)(CeedElemRestriction, CeedMemType, const CeedInt8 **);
   int (*Destroy)(CeedElemRestriction);
-  int      ref_count;
   CeedInt  num_elem;    /* number of elements */
   CeedInt  elem_size;   /* number of nodes per element */
   CeedInt  num_points;  /* number of points, for points restriction */
@@ -199,13 +203,12 @@ struct CeedElemRestriction_private {
 };
 
 struct CeedBasis_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector);
   int (*ApplyAdd)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector);
   int (*ApplyAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
   int (*ApplyAddAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
   int (*Destroy)(CeedBasis);
-  int                ref_count;
   bool               is_tensor_basis; /* flag for tensor basis */
   CeedInt            dim;             /* topological dimension */
   CeedElemTopology   topo;            /* element topology */
@@ -233,11 +236,10 @@ struct CeedBasis_private {
 };
 
 struct CeedTensorContract_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*Apply)(CeedTensorContract, CeedInt, CeedInt, CeedInt, CeedInt, const CeedScalar *restrict, CeedTransposeMode, const CeedInt,
                const CeedScalar *restrict, CeedScalar *restrict);
   int (*Destroy)(CeedTensorContract);
-  int   ref_count;
   void *data;
 };
 
@@ -248,12 +250,11 @@ struct CeedQFunctionField_private {
 };
 
 struct CeedQFunction_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*Apply)(CeedQFunction, CeedInt, CeedVector *, CeedVector *);
   int (*SetCUDAUserFunction)(CeedQFunction, void *);
   int (*SetHIPUserFunction)(CeedQFunction, void *);
   int (*Destroy)(CeedQFunction);
-  int                  ref_count;
   CeedInt              vec_length; /* Number of quadrature points must be padded to a multiple of vec_length */
   CeedInt              num_tabs;   /* Viewing offset */
   CeedQFunctionField  *input_fields;
@@ -275,8 +276,7 @@ struct CeedQFunction_private {
 };
 
 struct CeedQFunctionContext_private {
-  Ceed ceed;
-  int  ref_count;
+  CeedObject_private obj;
   int (*HasValidData)(CeedQFunctionContext, bool *);
   int (*HasBorrowedDataOfType)(CeedQFunctionContext, CeedMemType, bool *);
   int (*SetData)(CeedQFunctionContext, CeedMemType, CeedCopyMode, void *);
@@ -352,9 +352,8 @@ struct CeedOperatorAssemblyData_private {
 };
 
 struct CeedOperator_private {
-  Ceed         ceed;
-  CeedOperator op_fallback, op_fallback_parent;
-  int          ref_count;
+  CeedObject_private obj;
+  CeedOperator       op_fallback, op_fallback_parent;
   int (*LinearAssembleQFunction)(CeedOperator, CeedVector *, CeedElemRestriction *, CeedRequest *);
   int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *);
   int (*LinearAssembleDiagonal)(CeedOperator, CeedVector, CeedRequest *);
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 62d4307560..f5ac3dfcd1 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -186,6 +186,11 @@ CEED_INTERN int CeedReallocArray(size_t n, size_t unit, void *p);
 CEED_INTERN int CeedStringAllocCopy(const char *source, char **copy);
 CEED_INTERN int CeedFree(void *p);
 
+CEED_INTERN int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), CeedObject obj);
+CEED_INTERN int CeedObjectReference(CeedObject obj);
+CEED_INTERN int CeedObjectDereference(CeedObject obj);
+CEED_INTERN int CeedObjectDestroy(CeedObject obj);
+
 CEED_INTERN int CeedSetHostBoolArray(const bool *source_array, CeedCopyMode copy_mode, CeedSize num_values, const bool **target_array_owned,
                                      const bool **target_array_borrowed, const bool **target_array);
 CEED_INTERN int CeedSetHostCeedInt8Array(const CeedInt8 *source_array, CeedCopyMode copy_mode, CeedSize num_values,
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index f18b6c391c..0d3744bba1 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -99,6 +99,13 @@ typedef struct CeedContextFieldLabel_private *CeedContextFieldLabel;
 /// Given an element restriction \f$E\f$, basis evaluator \f$B\f$, and quadrature function\f$f\f$, a `CeedOperator` expresses operations of the form \f$E^T B^T f(B E u)\f$ acting on the vector \f$u\f$.
 /// @ingroup CeedOperatorUser
 typedef struct CeedOperator_private *CeedOperator;
+/// Generic type for all libCEED objects to support common functionality, such as viewing
+/// @ingroup CeedUser
+typedef struct CeedObject_private *CeedObject;
+
+CEED_EXTERN int  CeedObjectView(CeedObject obj, FILE *stream);
+CEED_EXTERN int  CeedObjectGetCeed(CeedObject obj, Ceed *ceed);
+CEED_EXTERN Ceed CeedObjectReturnCeed(CeedObject obj);
 
 CEED_EXTERN int CeedRegistryGetList(size_t *n, char ***const resources, CeedInt **array);
 CEED_EXTERN int CeedInit(const char *resource, Ceed *ceed);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 30b090b855..12b361e153 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -177,6 +177,21 @@ static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedI
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedBasis` passed as a `CeedObject`
+
+  @param[in] basis  `CeedBasis` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedBasisView_Object(CeedObject basis, FILE *stream) {
+  CeedCall(CeedBasisView((CeedBasis)basis, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Create the interpolation and gradient matrices for projection from the nodes of `basis_from` to the nodes of `basis_to`.
 
@@ -684,7 +699,7 @@ int CeedBasisCreateH1Fallback(Ceed ceed, CeedElemTopology topo, CeedInt num_comp
   CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
   CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1");
 
-  CeedCall(CeedReferenceCopy(delegate, &(basis)->ceed));
+  CeedCall(CeedReferenceCopy(delegate, &(basis)->obj.ceed));
   CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
   CeedCall(delegate->BasisCreateH1(topo, dim, P, Q, interp, grad, q_ref, q_weight, basis));
   CeedCall(CeedDestroy(&delegate));
@@ -847,7 +862,7 @@ int CeedBasisSetData(CeedBasis basis, void *data) {
   @ref Backend
 **/
 int CeedBasisReference(CeedBasis basis) {
-  basis->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)basis));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1524,8 +1539,7 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_
   CeedElemTopology topo = dim == 1 ? CEED_TOPOLOGY_LINE : dim == 2 ? CEED_TOPOLOGY_QUAD : CEED_TOPOLOGY_HEX;
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = true;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1663,8 +1677,7 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn
   CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1723,8 +1736,7 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Ceed
   CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1784,8 +1796,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
   curl_comp = (dim < 3) ? 1 : dim;
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -2143,21 +2154,20 @@ int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *
   @ref Advanced
 **/
 int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed) {
-  *ceed = NULL;
-  CeedCall(CeedReferenceCopy(CeedBasisReturnCeed(basis), ceed));
+  CeedCall(CeedObjectGetCeed((CeedObject)basis, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
   @brief Return the `Ceed` associated with a `CeedBasis`
 
-  @param[in]  basis `CeedBasis`
+  @param[in] basis `CeedBasis`
 
   @return `Ceed` associated with the `basis`
 
   @ref Advanced
 **/
-Ceed CeedBasisReturnCeed(CeedBasis basis) { return basis->ceed; }
+Ceed CeedBasisReturnCeed(CeedBasis basis) { return CeedObjectReturnCeed((CeedObject)basis); }
 
 /**
   @brief Get dimension for given `CeedBasis`
@@ -2436,7 +2446,7 @@ int CeedBasisGetCurl(CeedBasis basis, const CeedScalar **curl) {
 }
 
 /**
-  @brief Destroy a @ref  CeedBasis
+  @brief Destroy a @ref CeedBasis
 
   @param[in,out] basis `CeedBasis` to destroy
 
@@ -2445,7 +2455,7 @@ int CeedBasisGetCurl(CeedBasis basis, const CeedScalar **curl) {
   @ref User
 **/
 int CeedBasisDestroy(CeedBasis *basis) {
-  if (!*basis || *basis == CEED_BASIS_NONE || --(*basis)->ref_count > 0) {
+  if (!*basis || *basis == CEED_BASIS_NONE || CeedObjectDereference((CeedObject)*basis) > 0) {
     *basis = NULL;
     return CEED_ERROR_SUCCESS;
   }
@@ -2461,7 +2471,7 @@ int CeedBasisDestroy(CeedBasis *basis) {
   CeedCall(CeedFree(&(*basis)->curl));
   CeedCall(CeedVectorDestroy(&(*basis)->vec_chebyshev));
   CeedCall(CeedBasisDestroy(&(*basis)->basis_chebyshev));
-  CeedCall(CeedDestroy(&(*basis)->ceed));
+  CeedCall(CeedObjectDestroy(&(*basis)->obj));
   CeedCall(CeedFree(basis));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index 795477a4bf..efaa52f4e3 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -98,6 +98,21 @@ int CeedPermutePadCurlOrients(const CeedInt8 *curl_orients, CeedInt8 *block_curl
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedElemRestriction` passed as a `CeedObject`
+
+  @param[in] rstr   `CeedElemRestriction` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedElemRestrictionView_Object(CeedObject rstr, FILE *stream) {
+  CeedCall(CeedElemRestrictionView((CeedElemRestriction)rstr, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -531,7 +546,7 @@ int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data) {
   @ref Backend
 **/
 int CeedElemRestrictionReference(CeedElemRestriction rstr) {
-  rstr->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)rstr));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -643,8 +658,7 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, Ce
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -701,8 +715,7 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -760,8 +773,7 @@ int CeedElemRestrictionCreateCurlOriented(Ceed ceed, CeedInt num_elem, CeedInt e
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -815,8 +827,7 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_s
             (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count  = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem   = num_elem;
   (*rstr)->elem_size  = elem_size;
   (*rstr)->num_comp   = num_comp;
@@ -883,8 +894,7 @@ int CeedElemRestrictionCreateAtPoints(Ceed ceed, CeedInt num_elem, CeedInt num_p
             l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->num_points  = num_points;
   (*rstr)->num_comp    = num_comp;
@@ -949,8 +959,7 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_s
   CeedCall(CeedPermutePadOffsets(offsets, block_offsets, num_block, num_elem, block_size, elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -1020,8 +1029,7 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn
   CeedCall(CeedPermutePadOrients(orients, block_orients, num_block, num_elem, block_size, elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -1094,8 +1102,7 @@ int CeedElemRestrictionCreateBlockedCurlOriented(Ceed ceed, CeedInt num_elem, Ce
   CeedCall(CeedPermutePadCurlOrients(curl_orients, block_curl_orients, num_block, num_elem, block_size, 3 * elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -1153,8 +1160,7 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt
             (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count  = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
   (*rstr)->num_elem   = num_elem;
   (*rstr)->elem_size  = elem_size;
   (*rstr)->num_comp   = num_comp;
@@ -1186,10 +1192,8 @@ int CeedElemRestrictionCreateUnsignedCopy(CeedElemRestriction rstr, CeedElemRest
 
   // Copy old rstr
   memcpy(*rstr_unsigned, rstr, sizeof(struct CeedElemRestriction_private));
-  (*rstr_unsigned)->ceed = NULL;
-  CeedCall(CeedReferenceCopy(rstr->ceed, &(*rstr_unsigned)->ceed));
-  (*rstr_unsigned)->ref_count = 1;
-  (*rstr_unsigned)->strides   = NULL;
+  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, &(*rstr_unsigned)->obj));
+  (*rstr_unsigned)->strides = NULL;
   if (rstr->strides) {
     CeedCall(CeedMalloc(3, &(*rstr_unsigned)->strides));
     for (CeedInt i = 0; i < 3; i++) (*rstr_unsigned)->strides[i] = rstr->strides[i];
@@ -1218,10 +1222,8 @@ int CeedElemRestrictionCreateUnorientedCopy(CeedElemRestriction rstr, CeedElemRe
 
   // Copy old rstr
   memcpy(*rstr_unoriented, rstr, sizeof(struct CeedElemRestriction_private));
-  (*rstr_unoriented)->ceed = NULL;
-  CeedCall(CeedReferenceCopy(rstr->ceed, &(*rstr_unoriented)->ceed));
-  (*rstr_unoriented)->ref_count = 1;
-  (*rstr_unoriented)->strides   = NULL;
+  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, &(*rstr_unoriented)->obj));
+  (*rstr_unoriented)->strides = NULL;
   if (rstr->strides) {
     CeedCall(CeedMalloc(3, &(*rstr_unoriented)->strides));
     for (CeedInt i = 0; i < 3; i++) (*rstr_unoriented)->strides[i] = rstr->strides[i];
@@ -1438,8 +1440,7 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT
   @ref Advanced
 **/
 int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) {
-  *ceed = NULL;
-  CeedCall(CeedReferenceCopy(CeedElemRestrictionReturnCeed(rstr), ceed));
+  CeedCall(CeedObjectGetCeed((CeedObject)rstr, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1452,7 +1453,7 @@ int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedElemRestrictionReturnCeed(CeedElemRestriction rstr) { return rstr->ceed; }
+Ceed CeedElemRestrictionReturnCeed(CeedElemRestriction rstr) { return CeedObjectReturnCeed((CeedObject)rstr); }
 
 /**
   @brief Get the L-vector component stride
@@ -1811,11 +1812,11 @@ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) {
   @ref User
 **/
 int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) {
-  if (!*rstr || *rstr == CEED_ELEMRESTRICTION_NONE || --(*rstr)->ref_count > 0) {
+  if (!*rstr || *rstr == CEED_ELEMRESTRICTION_NONE || CeedObjectDereference((CeedObject)*rstr) > 0) {
     *rstr = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  CeedCheck((*rstr)->num_readers == 0, (*rstr)->ceed, CEED_ERROR_ACCESS,
+  CeedCheck((*rstr)->num_readers == 0, CeedElemRestrictionReturnCeed(*rstr), CEED_ERROR_ACCESS,
             "Cannot destroy CeedElemRestriction, a process has read access to the offset data");
 
   // Only destroy backend data once between rstr and unsigned copy
@@ -1823,7 +1824,7 @@ int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) {
   else if ((*rstr)->Destroy) CeedCall((*rstr)->Destroy(*rstr));
 
   CeedCall(CeedFree(&(*rstr)->strides));
-  CeedCall(CeedDestroy(&(*rstr)->ceed));
+  CeedCall(CeedObjectDestroy(&(*rstr)->obj));
   CeedCall(CeedFree(rstr));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-object.c b/interface/ceed-object.c
new file mode 100644
index 0000000000..7b13f875d7
--- /dev/null
+++ b/interface/ceed-object.c
@@ -0,0 +1,132 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed-impl.h>
+#include <ceed.h>
+#include <ceed/backend.h>
+
+/// @file
+/// Implementation of CeedObject functionality
+
+/// ----------------------------------------------------------------------------
+/// CeedObject Backend API
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedBackend
+/// @{
+
+/**
+  @brief Create a `CeedObject`
+
+  @param[in]  ceed          `Ceed` object to reference
+  @param[in]  view_function `Ceed*` function for viewing the `obj`
+  @param[out] obj           Address of the variable where is `CeedObject` exists
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), CeedObject obj) {
+  obj->ceed = NULL;
+  if (ceed) CeedCall(CeedReferenceCopy(ceed, &obj->ceed));
+  obj->ViewFunction = view_function;
+  obj->ref_count    = 1;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Increment the reference counter for a `CeedObject`
+
+  @param[in,out] obj `CeedObject` to increment the reference counter
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectReference(CeedObject obj) {
+  obj->ref_count++;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Decrement the reference counter for a `CeedObject`
+
+  @param[in,out] obj `CeedObject` to decrement the reference counter
+
+  @return The new reference count
+
+  @ref Backend
+**/
+int CeedObjectDereference(CeedObject obj) {
+  return --obj->ref_count;  // prefix notation, to get new number of references
+}
+
+/**
+  @brief Destroy a @ref CeedObject
+
+  @param[in,out] obj `CeedObject` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectDestroy(CeedObject obj) {
+  CeedCheck(obj->ref_count == 0, CeedObjectReturnCeed(obj), CEED_ERROR_ACCESS, "Cannot destroy CeedObject, it is still referenced by another object");
+  if (obj->ceed) CeedCall(CeedDestroy(&obj->ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
+/// ----------------------------------------------------------------------------
+/// CeedObject Public API
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedUser
+/// @{
+
+/**
+  @brief View a `CeedObject`
+
+  @param[in] obj    `CeedObject` to view
+  @param[in] stream Stream to view to, e.g., `stdout`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedObjectView(CeedObject obj, FILE *stream) {
+  if (obj->ViewFunction) CeedCall(obj->ViewFunction(obj, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the `Ceed` associated with a `CeedObject`
+
+  @param[in]  obj   `CeedObject`
+  @param[out] ceed  Variable to store `Ceed`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedObjectGetCeed(CeedObject obj, Ceed *ceed) {
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedObjectReturnCeed(obj), ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Return the `Ceed` associated with a `CeedObject`
+
+  @param[in] obj `CeedObject`
+
+  @return `Ceed` associated with the `basis`
+
+  @ref Advanced
+**/
+Ceed CeedObjectReturnCeed(CeedObject obj) { return (obj->ceed) ? obj->ceed : (Ceed)obj; }
+
+/// @}
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index d72333c294..53091d0600 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -174,6 +174,21 @@ int CeedOperatorSingleView(CeedOperator op, const char *tabs, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedOperator` passed as a `CeedObject`
+
+  @param[in] op     `CeedOperator` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedOperatorView_Object(CeedObject op, FILE *stream) {
+  CeedCall(CeedOperatorView((CeedOperator)op, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`.
 
@@ -708,7 +723,7 @@ int CeedOperatorSetData(CeedOperator op, void *data) {
   @ref Backend
 **/
 int CeedOperatorReference(CeedOperator op) {
-  op->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)op));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -763,8 +778,7 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc
   CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction.");
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed));
-  (*op)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, &(*op)->obj));
   (*op)->input_size  = -1;
   (*op)->output_size = -1;
   CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf));
@@ -806,8 +820,7 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
   CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction.");
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed));
-  (*op)->ref_count    = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, &(*op)->obj));
   (*op)->is_at_points = true;
   (*op)->input_size   = -1;
   (*op)->output_size  = -1;
@@ -843,8 +856,7 @@ int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op) {
   }
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed));
-  (*op)->ref_count    = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, &(*op)->obj));
   (*op)->is_composite = true;
   CeedCall(CeedCalloc(CEED_COMPOSITE_MAX, &(*op)->sub_operators));
   (*op)->input_size  = -1;
@@ -1697,8 +1709,7 @@ int CeedOperatorViewTerse(CeedOperator op, FILE *stream) {
   @ref Advanced
 **/
 int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) {
-  *ceed = NULL;
-  CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op), ceed));
+  CeedCall(CeedObjectGetCeed((CeedObject)op, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1711,7 +1722,7 @@ int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedOperatorReturnCeed(CeedOperator op) { return op->ceed; }
+Ceed CeedOperatorReturnCeed(CeedOperator op) { return CeedObjectReturnCeed((CeedObject)op); }
 
 /**
   @brief Get the number of elements associated with a `CeedOperator`
@@ -2368,7 +2379,7 @@ int CeedOperatorAssemblyDataStrip(CeedOperator op) {
   @ref User
 **/
 int CeedOperatorDestroy(CeedOperator *op) {
-  if (!*op || --(*op)->ref_count > 0) {
+  if (!*op || CeedObjectDereference((CeedObject)*op) > 0) {
     *op = NULL;
     return CEED_ERROR_SUCCESS;
   }
@@ -2436,7 +2447,7 @@ int CeedOperatorDestroy(CeedOperator *op) {
   CeedCall(CeedOperatorDestroy(&(*op)->op_fallback));
 
   CeedCall(CeedFree(&(*op)->name));
-  CeedCall(CeedDestroy(&(*op)->ceed));
+  CeedCall(CeedObjectDestroy(&(*op)->obj));
   CeedCall(CeedFree(op));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index aaed880485..1ccc3baa42 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1353,7 +1353,7 @@ int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyD
   if (!op->qf_assembled) {
     CeedQFunctionAssemblyData data;
 
-    CeedCall(CeedQFunctionAssemblyDataCreate(op->ceed, &data));
+    CeedCall(CeedQFunctionAssemblyDataCreate(CeedOperatorReturnCeed(op), &data));
     op->qf_assembled = data;
   }
   *data = op->qf_assembled;
@@ -1373,8 +1373,7 @@ int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyD
 int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data) {
   CeedCall(CeedCalloc(1, data));
   (*data)->ref_count = 1;
-  (*data)->ceed      = ceed;
-  CeedCall(CeedReference(ceed));
+  CeedCall(CeedReferenceCopy(ceed, &(*data)->ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1549,7 +1548,7 @@ int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyDat
   if (!op->op_assembled) {
     CeedOperatorAssemblyData data;
 
-    CeedCall(CeedOperatorAssemblyDataCreate(op->ceed, op, &data));
+    CeedCall(CeedOperatorAssemblyDataCreate(CeedOperatorReturnCeed(op), op, &data));
     op->op_assembled = data;
   }
   *data = op->op_assembled;
@@ -1589,8 +1588,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
 
   // Allocate
   CeedCall(CeedCalloc(1, data));
-  (*data)->ceed = ceed;
-  CeedCall(CeedReference(ceed));
+  CeedCall(CeedReferenceCopy(ceed, &(*data)->ceed));
 
   // Build OperatorAssembly data
   CeedCall(CeedOperatorGetQFunction(op, &qf));
@@ -2023,7 +2021,7 @@ int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent) {
 **/
 int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent) {
   *parent = NULL;
-  if (op->op_fallback_parent) CeedCall(CeedReferenceCopy(op->op_fallback_parent->ceed, parent));
+  if (op->op_fallback_parent) CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op->op_fallback_parent), parent));
   else CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op), parent));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index b15c2ceaaf..705d06b5f1 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -142,6 +142,21 @@ static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedQFunction` passed as a `CeedObject`
+
+  @param[in] qf     `CeedQFunction` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionView_Object(CeedObject qf, FILE *stream) {
+  CeedCall(CeedQFunctionView((CeedQFunction)qf, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Set flag to determine if Fortran interface is used
 
@@ -598,7 +613,7 @@ int CeedQFunctionSetImmutable(CeedQFunction qf) {
   @ref Backend
 **/
 int CeedQFunctionReference(CeedQFunction qf) {
-  qf->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -662,8 +677,7 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser
             "Provided path to source does not include function name. Provided: \"%s\"\nRequired: \"\\abs_path\\file.h:function_name\"", source);
 
   CeedCall(CeedCalloc(1, qf));
-  CeedCall(CeedReferenceCopy(ceed, &(*qf)->ceed));
-  (*qf)->ref_count           = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedQFunctionView_Object, &(*qf)->obj));
   (*qf)->vec_length          = vec_length;
   (*qf)->is_identity         = false;
   (*qf)->is_context_writable = true;
@@ -1091,8 +1105,7 @@ int CeedQFunctionView(CeedQFunction qf, FILE *stream) {
   @ref Advanced
 **/
 int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) {
-  *ceed = NULL;
-  CeedCall(CeedReferenceCopy(CeedQFunctionReturnCeed(qf), ceed));
+  CeedCall(CeedObjectGetCeed((CeedObject)qf, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1105,7 +1118,7 @@ int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return qf->ceed; }
+Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return CeedObjectReturnCeed((CeedObject)qf); }
 
 /**
   @brief Apply the action of a `CeedQFunction`
@@ -1143,7 +1156,7 @@ int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v
   @ref User
 **/
 int CeedQFunctionDestroy(CeedQFunction *qf) {
-  if (!*qf || --(*qf)->ref_count > 0) {
+  if (!*qf || CeedObjectDereference((CeedObject)*qf) > 0) {
     *qf = NULL;
     return CEED_ERROR_SUCCESS;
   }
@@ -1170,7 +1183,7 @@ int CeedQFunctionDestroy(CeedQFunction *qf) {
   CeedCall(CeedFree(&(*qf)->source_path));
   CeedCall(CeedFree(&(*qf)->gallery_name));
   CeedCall(CeedFree(&(*qf)->kernel_name));
-  CeedCall(CeedDestroy(&(*qf)->ceed));
+  CeedCall(CeedObjectDestroy(&(*qf)->obj));
   CeedCall(CeedFree(qf));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index 93a7fcecab..b308f5d038 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -127,6 +127,21 @@ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedQFunctionContext` passed as a `CeedObject`
+
+  @param[in] ctx    `CeedQFunctionContext` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionContextView_Object(CeedObject ctx, FILE *stream) {
+  CeedCall(CeedQFunctionContextView((CeedQFunctionContext)ctx, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -146,8 +161,7 @@ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) {
   @ref Backend
 **/
 int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) {
-  *ceed = NULL;
-  CeedCall(CeedReferenceCopy(CeedQFunctionContextReturnCeed(ctx), ceed));
+  CeedCall(CeedObjectGetCeed((CeedObject)ctx, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -160,7 +174,7 @@ int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) {
 
   @ref Backend
 **/
-Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return ctx->ceed; }
+Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return CeedObjectReturnCeed((CeedObject)ctx); }
 
 /**
   @brief Check for valid data in a `CeedQFunctionContext`
@@ -542,7 +556,7 @@ int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, CeedMemType *f_
   @ref Backend
 **/
 int CeedQFunctionContextReference(CeedQFunctionContext ctx) {
-  ctx->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -576,8 +590,7 @@ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) {
   }
 
   CeedCall(CeedCalloc(1, ctx));
-  CeedCall(CeedReferenceCopy(ceed, &(*ctx)->ceed));
-  (*ctx)->ref_count = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedQFunctionContextView_Object, &(*ctx)->obj));
   CeedCall(ceed->QFunctionContextCreate(*ctx));
   return CEED_ERROR_SUCCESS;
 }
@@ -970,11 +983,11 @@ int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_m
   @ref User
 **/
 int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) {
-  if (!*ctx || --(*ctx)->ref_count > 0) {
+  if (!*ctx || CeedObjectDereference((CeedObject)*ctx) > 0) {
     *ctx = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  CeedCheck(((*ctx)->state % 2) == 0, (*ctx)->ceed, 1, "Cannot destroy CeedQFunctionContext, the access lock is in use");
+  CeedCheck(((*ctx)->state % 2) == 0, CeedQFunctionContextReturnCeed(*ctx), 1, "Cannot destroy CeedQFunctionContext, the access lock is in use");
 
   CeedCall(CeedQFunctionContextDestroyData(*ctx));
   if ((*ctx)->Destroy) CeedCall((*ctx)->Destroy(*ctx));
@@ -984,7 +997,7 @@ int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) {
     CeedCall(CeedFree(&(*ctx)->field_labels[i]));
   }
   CeedCall(CeedFree(&(*ctx)->field_labels));
-  CeedCall(CeedDestroy(&(*ctx)->ceed));
+  CeedCall(CeedObjectDestroy(&(*ctx)->obj));
   CeedCall(CeedFree(ctx));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c
index ac2f789f72..757dd4ff17 100644
--- a/interface/ceed-tensor.c
+++ b/interface/ceed-tensor.c
@@ -41,7 +41,7 @@ int CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract) {
   }
 
   CeedCall(CeedCalloc(1, contract));
-  CeedCall(CeedReferenceCopy(ceed, &(*contract)->ceed));
+  CeedCall(CeedObjectCreate(ceed, NULL, &(*contract)->obj));
   CeedCall(ceed->TensorContractCreate(*contract));
   return CEED_ERROR_SUCCESS;
 }
@@ -124,8 +124,7 @@ int CeedTensorContractStridedApply(CeedTensorContract contract, CeedInt A, CeedI
   @ref Backend
 **/
 int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed) {
-  *ceed = NULL;
-  CeedCall(CeedReferenceCopy(CeedTensorContractReturnCeed(contract), ceed));
+  CeedCall(CeedObjectGetCeed((CeedObject)contract, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -138,7 +137,7 @@ int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed) {
 
   @ref Backend
 **/
-Ceed CeedTensorContractReturnCeed(CeedTensorContract contract) { return contract->ceed; }
+Ceed CeedTensorContractReturnCeed(CeedTensorContract contract) { return CeedObjectReturnCeed((CeedObject)contract); }
 
 /**
   @brief Get backend data of a `CeedTensorContract`
@@ -180,7 +179,7 @@ int CeedTensorContractSetData(CeedTensorContract contract, void *data) {
   @ref Backend
 **/
 int CeedTensorContractReference(CeedTensorContract contract) {
-  contract->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)contract));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -216,14 +215,14 @@ int CeedTensorContractReferenceCopy(CeedTensorContract tensor, CeedTensorContrac
   @ref Backend
 **/
 int CeedTensorContractDestroy(CeedTensorContract *contract) {
-  if (!*contract || --(*contract)->ref_count > 0) {
+  if (!*contract || CeedObjectDereference((CeedObject)*contract) > 0) {
     *contract = NULL;
     return CEED_ERROR_SUCCESS;
   }
   if ((*contract)->Destroy) {
     CeedCall((*contract)->Destroy(*contract));
   }
-  CeedCall(CeedDestroy(&(*contract)->ceed));
+  CeedCall(CeedObjectDestroy(&(*contract)->obj));
   CeedCall(CeedFree(contract));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 64e4e26227..32f7fb15db 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -33,6 +33,29 @@ const CeedVector CEED_VECTOR_NONE = &ceed_vector_none;
 
 /// @}
 
+/// ----------------------------------------------------------------------------
+/// CeedVector Internal Functions
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedVectorDeveloper
+/// @{
+
+/**
+  @brief View a `CeedVector` passed as a `CeedObject`
+
+  @param[in] vec    `CeedVector` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedVectorView_Object(CeedObject vec, FILE *stream) {
+  CeedCall(CeedVectorView((CeedVector)vec, "%12.8f", stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
 /// ----------------------------------------------------------------------------
 /// CeedVector Backend API
 /// ----------------------------------------------------------------------------
@@ -135,7 +158,7 @@ int CeedVectorSetData(CeedVector vec, void *data) {
   @ref Backend
 **/
 int CeedVectorReference(CeedVector vec) {
-  vec->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -171,10 +194,9 @@ int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) {
   }
 
   CeedCall(CeedCalloc(1, vec));
-  CeedCall(CeedReferenceCopy(ceed, &(*vec)->ceed));
-  (*vec)->ref_count = 1;
-  (*vec)->length    = length;
-  (*vec)->state     = 0;
+  CeedCall(CeedObjectCreate(ceed, CeedVectorView_Object, &(*vec)->obj));
+  (*vec)->length = length;
+  (*vec)->state  = 0;
   CeedCall(ceed->VectorCreate(length, *vec));
   return CEED_ERROR_SUCCESS;
 }
@@ -1105,8 +1127,7 @@ int CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream) {
   @ref Advanced
 **/
 int CeedVectorGetCeed(CeedVector vec, Ceed *ceed) {
-  *ceed = NULL;
-  CeedCall(CeedReferenceCopy(CeedVectorReturnCeed(vec), ceed));
+  CeedCall(CeedObjectGetCeed((CeedObject)vec, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1119,7 +1140,7 @@ int CeedVectorGetCeed(CeedVector vec, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedVectorReturnCeed(CeedVector vec) { return vec->ceed; }
+Ceed CeedVectorReturnCeed(CeedVector vec) { return CeedObjectReturnCeed((CeedObject)vec); }
 
 /**
   @brief Get the length of a `CeedVector`
@@ -1146,16 +1167,15 @@ int CeedVectorGetLength(CeedVector vec, CeedSize *length) {
   @ref User
 **/
 int CeedVectorDestroy(CeedVector *vec) {
-  if (!*vec || *vec == CEED_VECTOR_ACTIVE || *vec == CEED_VECTOR_NONE || --(*vec)->ref_count > 0) {
+  if (!*vec || *vec == CEED_VECTOR_ACTIVE || *vec == CEED_VECTOR_NONE || CeedObjectDereference((CeedObject)*vec) > 0) {
     *vec = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  CeedCheck((*vec)->state % 2 == 0, (*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, the writable access lock is in use");
-  CeedCheck((*vec)->num_readers == 0, (*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access");
+  CeedCheck((*vec)->state % 2 == 0, CeedVectorReturnCeed(*vec), CEED_ERROR_ACCESS, "Cannot destroy CeedVector, the writable access lock is in use");
+  CeedCheck((*vec)->num_readers == 0, CeedVectorReturnCeed(*vec), CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access");
 
   if ((*vec)->Destroy) CeedCall((*vec)->Destroy(*vec));
-
-  CeedCall(CeedDestroy(&(*vec)->ceed));
+  CeedCall(CeedObjectDestroy(&(*vec)->obj));
   CeedCall(CeedFree(vec));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed.c b/interface/ceed.c
index f53d92c4e4..01167241b5 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -165,9 +165,12 @@ static int CeedWorkVectorsDestroy(Ceed ceed) {
   if (!ceed->work_vectors) return CEED_ERROR_SUCCESS;
   for (CeedSize i = 0; i < ceed->work_vectors->num_vecs; i++) {
     CeedCheck(!ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " checked out but not returned");
-    ceed->ref_count += 2;  // Note: increase ref_count to prevent Ceed destructor from triggering again
+    // Note: increase ref_count to prevent Ceed destructor from triggering again
+    CeedCall(CeedObjectReference((CeedObject)ceed));
+    CeedCall(CeedObjectReference((CeedObject)ceed));
     CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i]));
-    ceed->ref_count -= 1;  // Note: restore ref_count
+    // Note: restore ref_count
+    CeedObjectDereference((CeedObject)ceed);
   }
   CeedCall(CeedFree(&ceed->work_vectors->is_in_use));
   CeedCall(CeedFree(&ceed->work_vectors->vecs));
@@ -175,6 +178,21 @@ static int CeedWorkVectorsDestroy(Ceed ceed) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `Ceed` passed as a `CeedObject`
+
+  @param[in] ceed   `Ceed` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedView_Object(CeedObject ceed, FILE *stream) {
+  CeedCall(CeedView((Ceed)ceed, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -759,7 +777,7 @@ int CeedSetData(Ceed ceed, void *data) {
   @ref Backend
 **/
 int CeedReference(Ceed ceed) {
-  ceed->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -823,9 +841,12 @@ int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) {
     CeedSize vec_len;
     CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len));
     if (vec_len < min_len) {
-      ceed->ref_count += 2;  // Note: increase ref_count to prevent Ceed destructor from triggering
+      // Note: increase ref_count to prevent Ceed destructor from triggering
+      CeedCall(CeedObjectReference((CeedObject)ceed));
+      CeedCall(CeedObjectReference((CeedObject)ceed));
       CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i]));
-      ceed->ref_count -= 1;  // Note: restore ref_count
+      // Note: restore ref_count
+      CeedObjectDereference((CeedObject)ceed);
       ceed->work_vectors->num_vecs--;
       if (ceed->work_vectors->num_vecs > 0) {
         ceed->work_vectors->vecs[i]                                 = ceed->work_vectors->vecs[ceed->work_vectors->num_vecs];
@@ -889,14 +910,16 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
     }
     ceed->work_vectors->num_vecs++;
     CeedCallBackend(CeedVectorCreate(ceed, len, &ceed->work_vectors->vecs[i]));
-    ceed->ref_count--;  // Note: ref_count manipulation to prevent a ref-loop
+    // Note: ref_count manipulation to prevent a ref-loop
+    CeedObjectDereference((CeedObject)ceed);
     if (ceed->is_debug) CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
   }
   // Return pointer to work vector
   ceed->work_vectors->is_in_use[i] = true;
   *vec                             = NULL;
   CeedCall(CeedVectorReferenceCopy(ceed->work_vectors->vecs[i], vec));
-  ceed->ref_count++;  // Note: bump ref_count to account for external access
+  // Note: bump ref_count to account for external access
+  CeedCall(CeedObjectReference((CeedObject)ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -926,7 +949,8 @@ int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) {
       CeedCheck(ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " was not checked out but is being returned");
       CeedCall(CeedVectorDestroy(vec));
       ceed->work_vectors->is_in_use[i] = false;
-      ceed->ref_count--;  // Note: reduce ref_count again to prevent a ref-loop
+      // Note: reduce ref_count again to prevent a ref-loop
+      CeedObjectDereference((CeedObject)ceed);
       return CEED_ERROR_SUCCESS;
     }
   }
@@ -1209,6 +1233,7 @@ int CeedInit(const char *resource, Ceed *ceed) {
 
   // Setup Ceed
   CeedCall(CeedCalloc(1, ceed));
+  CeedCall(CeedObjectCreate(NULL, CeedView_Object, &(*ceed)->obj));
   CeedCall(CeedCalloc(1, &(*ceed)->jit_source_roots));
   CeedCall(CeedCalloc(1, &(*ceed)->rust_source_roots));
   const char *ceed_error_handler = getenv("CEED_ERROR_HANDLER");
@@ -1217,8 +1242,7 @@ int CeedInit(const char *resource, Ceed *ceed) {
   else if (!strcmp(ceed_error_handler, "store")) (*ceed)->Error = CeedErrorStore;
   else (*ceed)->Error = CeedErrorAbort;
   memcpy((*ceed)->err_msg, "No error message stored", 24);
-  (*ceed)->ref_count = 1;
-  (*ceed)->data      = NULL;
+  (*ceed)->data = NULL;
 
   // Set lookup table
   FOffset f_offsets[] = {
@@ -1611,7 +1635,7 @@ int CeedView(Ceed ceed, FILE *stream) {
   @ref User
 **/
 int CeedDestroy(Ceed *ceed) {
-  if (!*ceed || --(*ceed)->ref_count > 0) {
+  if (!*ceed || CeedObjectDereference((CeedObject)*ceed) > 0) {
     *ceed = NULL;
     return CEED_ERROR_SUCCESS;
   }
@@ -1651,6 +1675,7 @@ int CeedDestroy(Ceed *ceed) {
   CeedCall(CeedFree(&(*ceed)->resource));
   CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));
   CeedCall(CeedWorkVectorsDestroy(*ceed));
+  CeedCall(CeedObjectDestroy(&(*ceed)->obj));
   CeedCall(CeedFree(ceed));
   return CEED_ERROR_SUCCESS;
 }

From a299a25b9879b9b54a5c96c9f88ed0cb403a2fde Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 20 Jan 2026 09:05:18 -0700
Subject: [PATCH 520/571] obj - move view tabs to CeedObj

---
 include/ceed-impl.h               | 10 ++--------
 include/ceed/ceed.h               |  2 ++
 interface/ceed-basis.c            |  5 ++---
 interface/ceed-elemrestriction.c  |  5 ++---
 interface/ceed-object.c           | 31 +++++++++++++++++++++++++++++++
 interface/ceed-operator.c         |  5 ++---
 interface/ceed-qfunction.c        |  5 ++---
 interface/ceed-qfunctioncontext.c |  5 ++---
 interface/ceed-vector.c           |  5 ++---
 interface/ceed.c                  |  5 ++---
 10 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 434943a1b2..7496dda577 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -93,7 +93,8 @@ struct CeedWorkVectors_private {
 typedef struct CeedObject_private {
   Ceed ceed;
   int (*ViewFunction)(CeedObject, FILE *);
-  int ref_count;
+  int     ref_count;
+  CeedInt num_view_tabs;
 } CeedObject_private;
 
 struct Ceed_private {
@@ -111,7 +112,6 @@ struct Ceed_private {
   bool               cuda_compile_with_clang;
   char             **jit_defines;
   CeedInt            num_jit_defines, max_jit_defines, num_jit_defines_readers;
-  CeedInt            num_tabs; /* Viewing offset */
   int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *);
   int (*SetStream)(Ceed, void *);
   int (*GetPreferredMemType)(CeedMemType *);
@@ -166,7 +166,6 @@ struct CeedVector_private {
   CeedSize length;
   uint64_t state;
   uint64_t num_readers;
-  CeedInt  num_tabs; /* Viewing offset */
   void    *data;
 };
 
@@ -198,7 +197,6 @@ struct CeedElemRestriction_private {
   CeedRestrictionType
            rstr_type;   /* initialized in element restriction constructor for default, oriented, curl-oriented, or strided element restriction */
   uint64_t num_readers; /* number of instances of offset read only access */
-  CeedInt  num_tabs;    /* Viewing offset */
   void    *data;        /* place for the backend to store any data */
 };
 
@@ -231,7 +229,6 @@ struct CeedBasis_private {
                        quadrature points for H(curl) discretizations */
   CeedVector  vec_chebyshev;
   CeedBasis   basis_chebyshev; /* basis interpolating from nodes to Chebyshev polynomial coefficients */
-  CeedInt     num_tabs;        /* Viewing offset */
   void       *data;            /* place for the backend to store any data */
 };
 
@@ -256,7 +253,6 @@ struct CeedQFunction_private {
   int (*SetHIPUserFunction)(CeedQFunction, void *);
   int (*Destroy)(CeedQFunction);
   CeedInt              vec_length; /* Number of quadrature points must be padded to a multiple of vec_length */
-  CeedInt              num_tabs;   /* Viewing offset */
   CeedQFunctionField  *input_fields;
   CeedQFunctionField  *output_fields;
   CeedInt              num_input_fields, num_output_fields;
@@ -291,7 +287,6 @@ struct CeedQFunctionContext_private {
   CeedMemType                         data_destroy_mem_type;
   CeedInt                             num_fields;
   CeedInt                             max_fields;
-  CeedInt                             num_tabs; /* Viewing offset */
   CeedContextFieldLabel              *field_labels;
   uint64_t                            state;
   uint64_t                            num_readers;
@@ -373,7 +368,6 @@ struct CeedOperator_private {
   CeedOperatorField        *input_fields;
   CeedOperatorField        *output_fields;
   CeedSize                  input_size, output_size;
-  CeedInt                   num_tabs;   /* Viewing offset */
   CeedInt                   num_elem;   /* Number of elements */
   CeedInt                   num_qpts;   /* Number of quadrature points over all elements */
   CeedInt                   num_fields; /* Number of fields that have been set */
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 0d3744bba1..006dea00e0 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -104,6 +104,8 @@ typedef struct CeedOperator_private *CeedOperator;
 typedef struct CeedObject_private *CeedObject;
 
 CEED_EXTERN int  CeedObjectView(CeedObject obj, FILE *stream);
+CEED_EXTERN int  CeedObjectSetNumViewTabs(CeedObject obj, CeedInt num_tabs);
+CEED_EXTERN int  CeedObjectGetNumViewTabs(CeedObject obj, CeedInt *num_tabs);
 CEED_EXTERN int  CeedObjectGetCeed(CeedObject obj, Ceed *ceed);
 CEED_EXTERN Ceed CeedObjectReturnCeed(CeedObject obj);
 
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 12b361e153..e2e58852a9 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1915,8 +1915,7 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) {
   @ref User
 **/
 int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs) {
-  CeedCheck(num_tabs >= 0, CeedBasisReturnCeed(basis), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
-  basis->num_tabs = num_tabs;
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)basis, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1931,7 +1930,7 @@ int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs) {
   @ref User
 **/
 int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs) {
-  *num_tabs = basis->num_tabs;
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)basis, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index efaa52f4e3..b5bbd20ade 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -1733,8 +1733,7 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult
   @ref User
 **/
 int CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs) {
-  CeedCheck(num_tabs >= 0, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
-  rstr->num_tabs = num_tabs;
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)rstr, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1749,7 +1748,7 @@ int CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs
   @ref User
 **/
 int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs) {
-  *num_tabs = rstr->num_tabs;
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)rstr, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-object.c b/interface/ceed-object.c
index 7b13f875d7..c054de1798 100644
--- a/interface/ceed-object.c
+++ b/interface/ceed-object.c
@@ -102,6 +102,37 @@ int CeedObjectView(CeedObject obj, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedObjectView() output
+
+  @param[in] obj      `CeedObject` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedObjectSetNumViewTabs(CeedObject obj, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedObjectReturnCeed(obj), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  obj->num_view_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedObjectView() output
+
+  @param[in]  obj      `CeedObject` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedObjectGetNumViewTabs(CeedObject obj, CeedInt *num_tabs) {
+  *num_tabs = obj->num_view_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the `Ceed` associated with a `CeedObject`
 
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 53091d0600..582d140ed3 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1648,8 +1648,7 @@ static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
   @ref User
 **/
 int CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs) {
-  CeedCheck(num_tabs >= 0, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
-  op->num_tabs = num_tabs;
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)op, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1664,7 +1663,7 @@ int CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs) {
   @ref User
 **/
 int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs) {
-  *num_tabs = op->num_tabs;
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)op, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 705d06b5f1..1f0131eeef 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -1036,8 +1036,7 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) {
   @ref User
 **/
 int CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs) {
-  CeedCheck(num_tabs >= 0, CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
-  qf->num_tabs = num_tabs;
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)qf, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1052,7 +1051,7 @@ int CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs) {
   @ref User
 **/
 int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs) {
-  *num_tabs = qf->num_tabs;
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)qf, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index b308f5d038..fd56007b3a 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -905,8 +905,7 @@ int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_siz
   @ref User
 **/
 int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs) {
-  CeedCheck(num_tabs >= 0, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
-  ctx->num_tabs = num_tabs;
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)ctx, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -921,7 +920,7 @@ int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tab
   @ref User
 **/
 int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs) {
-  *num_tabs = ctx->num_tabs;
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)ctx, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 32f7fb15db..c92d7d1441 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -1027,8 +1027,7 @@ int CeedVectorReciprocal(CeedVector vec) {
   @ref User
 **/
 int CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs) {
-  CeedCheck(num_tabs >= 0, CeedVectorReturnCeed(vec), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
-  vec->num_tabs = num_tabs;
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)vec, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1043,7 +1042,7 @@ int CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs) {
   @ref User
 **/
 int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs) {
-  *num_tabs = vec->num_tabs;
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)vec, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/interface/ceed.c b/interface/ceed.c
index 01167241b5..ebce488850 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1572,8 +1572,7 @@ int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
   @ref User
 **/
 int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs) {
-  CeedCheck(num_tabs >= 0, ceed, CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
-  ceed->num_tabs = num_tabs;
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)ceed, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1588,7 +1587,7 @@ int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs) {
   @ref User
 **/
 int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs) {
-  *num_tabs = ceed->num_tabs;
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)ceed, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 

From 6c328a79ca68936a528462ed2722d2e5d44e1e31 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 20 Jan 2026 10:09:56 -0700
Subject: [PATCH 521/571] obj - add common CeedObjectDestroy

---
 include/ceed-impl.h               |  3 ++-
 include/ceed/backend.h            |  4 ++--
 include/ceed/ceed.h               |  1 +
 interface/ceed-basis.c            | 24 +++++++++++++++----
 interface/ceed-elemrestriction.c  | 40 +++++++++++++++++++++----------
 interface/ceed-object.c           | 36 +++++++++++++++++++++-------
 interface/ceed-operator.c         | 22 +++++++++++++----
 interface/ceed-qfunction.c        | 18 ++++++++++++--
 interface/ceed-qfunctioncontext.c | 18 ++++++++++++--
 interface/ceed-tensor.c           | 26 ++++++++++++++++++--
 interface/ceed-vector.c           | 18 ++++++++++++--
 interface/ceed.c                  | 18 ++++++++++++--
 12 files changed, 185 insertions(+), 43 deletions(-)

diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 7496dda577..669f039d5a 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -92,7 +92,8 @@ struct CeedWorkVectors_private {
 
 typedef struct CeedObject_private {
   Ceed ceed;
-  int (*ViewFunction)(CeedObject, FILE *);
+  int (*View)(CeedObject, FILE *);
+  int (*Destroy)(CeedObject *);
   int     ref_count;
   CeedInt num_view_tabs;
 } CeedObject_private;
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index f5ac3dfcd1..87f5f32c25 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -186,10 +186,10 @@ CEED_INTERN int CeedReallocArray(size_t n, size_t unit, void *p);
 CEED_INTERN int CeedStringAllocCopy(const char *source, char **copy);
 CEED_INTERN int CeedFree(void *p);
 
-CEED_INTERN int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), CeedObject obj);
+CEED_INTERN int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), int (*destroy_function)(CeedObject *), CeedObject obj);
 CEED_INTERN int CeedObjectReference(CeedObject obj);
 CEED_INTERN int CeedObjectDereference(CeedObject obj);
-CEED_INTERN int CeedObjectDestroy(CeedObject obj);
+CEED_INTERN int CeedObjectDestroy_Private(CeedObject obj);
 
 CEED_INTERN int CeedSetHostBoolArray(const bool *source_array, CeedCopyMode copy_mode, CeedSize num_values, const bool **target_array_owned,
                                      const bool **target_array_borrowed, const bool **target_array);
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 006dea00e0..d5db4a4bc5 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -108,6 +108,7 @@ CEED_EXTERN int  CeedObjectSetNumViewTabs(CeedObject obj, CeedInt num_tabs);
 CEED_EXTERN int  CeedObjectGetNumViewTabs(CeedObject obj, CeedInt *num_tabs);
 CEED_EXTERN int  CeedObjectGetCeed(CeedObject obj, Ceed *ceed);
 CEED_EXTERN Ceed CeedObjectReturnCeed(CeedObject obj);
+CEED_EXTERN int  CeedObjectDestroy(CeedObject *obj);
 
 CEED_EXTERN int CeedRegistryGetList(size_t *n, char ***const resources, CeedInt **array);
 CEED_EXTERN int CeedInit(const char *resource, Ceed *ceed);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index e2e58852a9..23bf05c419 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -192,6 +192,20 @@ static int CeedBasisView_Object(CeedObject basis, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy a `CeedBasis` passed as a `CeedObject`
+
+  @param[in,out] basis Address of `CeedBasis` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedBasisDestroy_Object(CeedObject *basis) {
+  CeedCall(CeedBasisDestroy((CeedBasis *)basis));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Create the interpolation and gradient matrices for projection from the nodes of `basis_from` to the nodes of `basis_to`.
 
@@ -1539,7 +1553,7 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_
   CeedElemTopology topo = dim == 1 ? CEED_TOPOLOGY_LINE : dim == 2 ? CEED_TOPOLOGY_QUAD : CEED_TOPOLOGY_HEX;
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = true;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1677,7 +1691,7 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn
   CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1736,7 +1750,7 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Ceed
   CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1796,7 +1810,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
   curl_comp = (dim < 3) ? 1 : dim;
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, &(*basis)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -2470,7 +2484,7 @@ int CeedBasisDestroy(CeedBasis *basis) {
   CeedCall(CeedFree(&(*basis)->curl));
   CeedCall(CeedVectorDestroy(&(*basis)->vec_chebyshev));
   CeedCall(CeedBasisDestroy(&(*basis)->basis_chebyshev));
-  CeedCall(CeedObjectDestroy(&(*basis)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*basis)->obj));
   CeedCall(CeedFree(basis));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index b5bbd20ade..476daab0c2 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -113,6 +113,20 @@ static int CeedElemRestrictionView_Object(CeedObject rstr, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy a `CeedElemRestricton` passed as a `CeedObject`
+
+  @param[in,out] rstr Address of `CeedElemRestriction` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedElemRestrictionDestroy_Object(CeedObject *rstr) {
+  CeedCall(CeedElemRestrictionDestroy((CeedElemRestriction *)rstr));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -658,7 +672,7 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, Ce
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -715,7 +729,7 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -773,7 +787,7 @@ int CeedElemRestrictionCreateCurlOriented(Ceed ceed, CeedInt num_elem, CeedInt e
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -827,7 +841,7 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_s
             (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem   = num_elem;
   (*rstr)->elem_size  = elem_size;
   (*rstr)->num_comp   = num_comp;
@@ -894,7 +908,7 @@ int CeedElemRestrictionCreateAtPoints(Ceed ceed, CeedInt num_elem, CeedInt num_p
             l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->num_points  = num_points;
   (*rstr)->num_comp    = num_comp;
@@ -959,7 +973,7 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_s
   CeedCall(CeedPermutePadOffsets(offsets, block_offsets, num_block, num_elem, block_size, elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -1029,7 +1043,7 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn
   CeedCall(CeedPermutePadOrients(orients, block_orients, num_block, num_elem, block_size, elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -1102,7 +1116,7 @@ int CeedElemRestrictionCreateBlockedCurlOriented(Ceed ceed, CeedInt num_elem, Ce
   CeedCall(CeedPermutePadCurlOrients(curl_orients, block_curl_orients, num_block, num_elem, block_size, 3 * elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -1160,7 +1174,7 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt
             (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, &(*rstr)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem   = num_elem;
   (*rstr)->elem_size  = elem_size;
   (*rstr)->num_comp   = num_comp;
@@ -1192,7 +1206,8 @@ int CeedElemRestrictionCreateUnsignedCopy(CeedElemRestriction rstr, CeedElemRest
 
   // Copy old rstr
   memcpy(*rstr_unsigned, rstr, sizeof(struct CeedElemRestriction_private));
-  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, &(*rstr_unsigned)->obj));
+  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object,
+                            &(*rstr_unsigned)->obj));
   (*rstr_unsigned)->strides = NULL;
   if (rstr->strides) {
     CeedCall(CeedMalloc(3, &(*rstr_unsigned)->strides));
@@ -1222,7 +1237,8 @@ int CeedElemRestrictionCreateUnorientedCopy(CeedElemRestriction rstr, CeedElemRe
 
   // Copy old rstr
   memcpy(*rstr_unoriented, rstr, sizeof(struct CeedElemRestriction_private));
-  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, &(*rstr_unoriented)->obj));
+  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object,
+                            &(*rstr_unoriented)->obj));
   (*rstr_unoriented)->strides = NULL;
   if (rstr->strides) {
     CeedCall(CeedMalloc(3, &(*rstr_unoriented)->strides));
@@ -1823,7 +1839,7 @@ int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) {
   else if ((*rstr)->Destroy) CeedCall((*rstr)->Destroy(*rstr));
 
   CeedCall(CeedFree(&(*rstr)->strides));
-  CeedCall(CeedObjectDestroy(&(*rstr)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*rstr)->obj));
   CeedCall(CeedFree(rstr));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-object.c b/interface/ceed-object.c
index c054de1798..afb1575e4c 100644
--- a/interface/ceed-object.c
+++ b/interface/ceed-object.c
@@ -21,19 +21,22 @@
 /**
   @brief Create a `CeedObject`
 
-  @param[in]  ceed          `Ceed` object to reference
-  @param[in]  view_function `Ceed*` function for viewing the `obj`
-  @param[out] obj           Address of the variable where is `CeedObject` exists
+  @param[in]  ceed             `Ceed` object to reference
+  @param[in]  view_function    `Ceed*` function for viewing the `obj`
+  @param[in]  destroy_function `Ceed*` function for destroying the `obj`
+  @param[out] obj              Address of the variable where is `CeedObject` exists
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Backend
 **/
-int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), CeedObject obj) {
+int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), int (*destroy_function)(CeedObject *), CeedObject obj) {
   obj->ceed = NULL;
   if (ceed) CeedCall(CeedReferenceCopy(ceed, &obj->ceed));
-  obj->ViewFunction = view_function;
-  obj->ref_count    = 1;
+  obj->View = view_function;
+  CeedCheck(destroy_function, CeedObjectReturnCeed(obj), CEED_ERROR_UNSUPPORTED, "Must provide destroy function to create CeedObject");
+  obj->Destroy   = destroy_function;
+  obj->ref_count = 1;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -73,8 +76,9 @@ int CeedObjectDereference(CeedObject obj) {
 
   @ref Backend
 **/
-int CeedObjectDestroy(CeedObject obj) {
-  CeedCheck(obj->ref_count == 0, CeedObjectReturnCeed(obj), CEED_ERROR_ACCESS, "Cannot destroy CeedObject, it is still referenced by another object");
+int CeedObjectDestroy_Private(CeedObject obj) {
+  CeedCheck(obj->ref_count == 0, CeedObjectReturnCeed(obj), CEED_ERROR_UNSUPPORTED,
+            "Cannot destroy CeedObject, it is still referenced by another object");
   if (obj->ceed) CeedCall(CeedDestroy(&obj->ceed));
   return CEED_ERROR_SUCCESS;
 }
@@ -98,7 +102,7 @@ int CeedObjectDestroy(CeedObject obj) {
   @ref User
 **/
 int CeedObjectView(CeedObject obj, FILE *stream) {
-  if (obj->ViewFunction) CeedCall(obj->ViewFunction(obj, stream));
+  if (obj->View) CeedCall(obj->View(obj, stream));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -160,4 +164,18 @@ int CeedObjectGetCeed(CeedObject obj, Ceed *ceed) {
 **/
 Ceed CeedObjectReturnCeed(CeedObject obj) { return (obj->ceed) ? obj->ceed : (Ceed)obj; }
 
+/**
+  @brief Destroy a @ref CeedObject
+
+  @param[in,out] obj Address of `CeedObject` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectDestroy(CeedObject *obj) {
+  CeedCall((*obj)->Destroy(obj));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 582d140ed3..6eb0b74d46 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -189,6 +189,20 @@ static int CeedOperatorView_Object(CeedObject op, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy a `CeedOperator` passed as a `CeedObject`
+
+  @param[in,out] op Address of `CeedOperator` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedOperatorDestroy_Object(CeedObject *op) {
+  CeedCall(CeedOperatorDestroy((CeedOperator *)op));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`.
 
@@ -778,7 +792,7 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc
   CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction.");
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, &(*op)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj));
   (*op)->input_size  = -1;
   (*op)->output_size = -1;
   CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf));
@@ -820,7 +834,7 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
   CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction.");
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, &(*op)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj));
   (*op)->is_at_points = true;
   (*op)->input_size   = -1;
   (*op)->output_size  = -1;
@@ -856,7 +870,7 @@ int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op) {
   }
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, &(*op)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj));
   (*op)->is_composite = true;
   CeedCall(CeedCalloc(CEED_COMPOSITE_MAX, &(*op)->sub_operators));
   (*op)->input_size  = -1;
@@ -2446,7 +2460,7 @@ int CeedOperatorDestroy(CeedOperator *op) {
   CeedCall(CeedOperatorDestroy(&(*op)->op_fallback));
 
   CeedCall(CeedFree(&(*op)->name));
-  CeedCall(CeedObjectDestroy(&(*op)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*op)->obj));
   CeedCall(CeedFree(op));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 1f0131eeef..e9a6430925 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -157,6 +157,20 @@ static int CeedQFunctionView_Object(CeedObject qf, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy a `CeedQFunction` passed as a `CeedObject`
+
+  @param[in,out] qf Address of `CeedQFunction` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionDestroy_Object(CeedObject *qf) {
+  CeedCall(CeedQFunctionDestroy((CeedQFunction *)qf));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Set flag to determine if Fortran interface is used
 
@@ -677,7 +691,7 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser
             "Provided path to source does not include function name. Provided: \"%s\"\nRequired: \"\\abs_path\\file.h:function_name\"", source);
 
   CeedCall(CeedCalloc(1, qf));
-  CeedCall(CeedObjectCreate(ceed, CeedQFunctionView_Object, &(*qf)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedQFunctionView_Object, CeedQFunctionDestroy_Object, &(*qf)->obj));
   (*qf)->vec_length          = vec_length;
   (*qf)->is_identity         = false;
   (*qf)->is_context_writable = true;
@@ -1182,7 +1196,7 @@ int CeedQFunctionDestroy(CeedQFunction *qf) {
   CeedCall(CeedFree(&(*qf)->source_path));
   CeedCall(CeedFree(&(*qf)->gallery_name));
   CeedCall(CeedFree(&(*qf)->kernel_name));
-  CeedCall(CeedObjectDestroy(&(*qf)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*qf)->obj));
   CeedCall(CeedFree(qf));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index fd56007b3a..48563a9999 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -142,6 +142,20 @@ static int CeedQFunctionContextView_Object(CeedObject ctx, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy a `CeedQFunctionContext` passed as a `CeedObject`
+
+  @param[in,out] ctx Address of `CeedQFunctionContext` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionContextDestroy_Object(CeedObject *ctx) {
+  CeedCall(CeedQFunctionContextDestroy((CeedQFunctionContext *)ctx));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -590,7 +604,7 @@ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) {
   }
 
   CeedCall(CeedCalloc(1, ctx));
-  CeedCall(CeedObjectCreate(ceed, CeedQFunctionContextView_Object, &(*ctx)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedQFunctionContextView_Object, CeedQFunctionContextDestroy_Object, &(*ctx)->obj));
   CeedCall(ceed->QFunctionContextCreate(*ctx));
   return CEED_ERROR_SUCCESS;
 }
@@ -996,7 +1010,7 @@ int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) {
     CeedCall(CeedFree(&(*ctx)->field_labels[i]));
   }
   CeedCall(CeedFree(&(*ctx)->field_labels));
-  CeedCall(CeedObjectDestroy(&(*ctx)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*ctx)->obj));
   CeedCall(CeedFree(ctx));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c
index 757dd4ff17..24f3687c62 100644
--- a/interface/ceed-tensor.c
+++ b/interface/ceed-tensor.c
@@ -13,6 +13,28 @@
 /// @file
 /// Implementation of CeedTensorContract interfaces
 
+/// ----------------------------------------------------------------------------
+/// CeedTensorContract Library Internal Functions
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedTensorContractDeveloper
+/// @{
+
+/**
+  @brief Destroy a `CeedTensorContract` passed as a `CeedObject`
+
+  @param[in,out] contract Address of `CeedTensorContract` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedTensorContractDestroy_Object(CeedObject *contract) {
+  CeedCall(CeedTensorContractDestroy((CeedTensorContract *)contract));
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
 /// ----------------------------------------------------------------------------
 /// CeedTensorContract Backend API
 /// ----------------------------------------------------------------------------
@@ -41,7 +63,7 @@ int CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract) {
   }
 
   CeedCall(CeedCalloc(1, contract));
-  CeedCall(CeedObjectCreate(ceed, NULL, &(*contract)->obj));
+  CeedCall(CeedObjectCreate(ceed, NULL, CeedTensorContractDestroy_Object, &(*contract)->obj));
   CeedCall(ceed->TensorContractCreate(*contract));
   return CEED_ERROR_SUCCESS;
 }
@@ -222,7 +244,7 @@ int CeedTensorContractDestroy(CeedTensorContract *contract) {
   if ((*contract)->Destroy) {
     CeedCall((*contract)->Destroy(*contract));
   }
-  CeedCall(CeedObjectDestroy(&(*contract)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*contract)->obj));
   CeedCall(CeedFree(contract));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index c92d7d1441..1a42381897 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -54,6 +54,20 @@ static int CeedVectorView_Object(CeedObject vec, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy a `CeedVector` passed as a `CeedObject`
+
+  @param[in,out] vec Address of `CeedVector` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedVectorDestroy_Object(CeedObject *vec) {
+  CeedCall(CeedVectorDestroy((CeedVector *)vec));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -194,7 +208,7 @@ int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) {
   }
 
   CeedCall(CeedCalloc(1, vec));
-  CeedCall(CeedObjectCreate(ceed, CeedVectorView_Object, &(*vec)->obj));
+  CeedCall(CeedObjectCreate(ceed, CeedVectorView_Object, CeedVectorDestroy_Object, &(*vec)->obj));
   (*vec)->length = length;
   (*vec)->state  = 0;
   CeedCall(ceed->VectorCreate(length, *vec));
@@ -1174,7 +1188,7 @@ int CeedVectorDestroy(CeedVector *vec) {
   CeedCheck((*vec)->num_readers == 0, CeedVectorReturnCeed(*vec), CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access");
 
   if ((*vec)->Destroy) CeedCall((*vec)->Destroy(*vec));
-  CeedCall(CeedObjectDestroy(&(*vec)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*vec)->obj));
   CeedCall(CeedFree(vec));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed.c b/interface/ceed.c
index ebce488850..790a0cb686 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -193,6 +193,20 @@ static int CeedView_Object(CeedObject ceed, FILE *stream) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Destroy a `Ceed` passed as a `CeedObject`
+
+  @param[in,out] ceed Address of `Ceed` context to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedDestroy_Object(CeedObject *ceed) {
+  CeedCall(CeedDestroy((Ceed *)ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -1233,7 +1247,7 @@ int CeedInit(const char *resource, Ceed *ceed) {
 
   // Setup Ceed
   CeedCall(CeedCalloc(1, ceed));
-  CeedCall(CeedObjectCreate(NULL, CeedView_Object, &(*ceed)->obj));
+  CeedCall(CeedObjectCreate(NULL, CeedView_Object, CeedDestroy_Object, &(*ceed)->obj));
   CeedCall(CeedCalloc(1, &(*ceed)->jit_source_roots));
   CeedCall(CeedCalloc(1, &(*ceed)->rust_source_roots));
   const char *ceed_error_handler = getenv("CEED_ERROR_HANDLER");
@@ -1674,7 +1688,7 @@ int CeedDestroy(Ceed *ceed) {
   CeedCall(CeedFree(&(*ceed)->resource));
   CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));
   CeedCall(CeedWorkVectorsDestroy(*ceed));
-  CeedCall(CeedObjectDestroy(&(*ceed)->obj));
+  CeedCall(CeedObjectDestroy_Private(&(*ceed)->obj));
   CeedCall(CeedFree(ceed));
   return CEED_ERROR_SUCCESS;
 }

From 82a9f6a50dfeefde267cdcc52ec1ab58d2e2b9ff Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 20 Jan 2026 10:37:50 -0700
Subject: [PATCH 522/571] test - add CeedObject tests

---
 tests/t003-ceed.c            |  9 +++++++++
 tests/t107-vector.c          | 11 +++++++++--
 tests/t210-elemrestriction.c | 12 ++++++++++--
 tests/t300-basis.c           | 14 +++++++++++---
 tests/t402-qfunction.c       | 19 +++++++++++++++----
 tests/t504-operator.c        | 16 ++++++++++++----
 6 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/tests/t003-ceed.c b/tests/t003-ceed.c
index 1d58ad7ce0..9e323c6d49 100644
--- a/tests/t003-ceed.c
+++ b/tests/t003-ceed.c
@@ -14,6 +14,15 @@ int main(int argc, char **argv) {
   CeedSetNumViewTabs(ceed, 1);
   CeedView(ceed, stdout);
 
+  // Check CeedObject interface
+  {
+    Ceed ceed_copy = NULL;
+
+    CeedReferenceCopy(ceed, &ceed_copy);
+    CeedObjectView((CeedObject)ceed_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&ceed_copy);
+  }
+
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t107-vector.c b/tests/t107-vector.c
index b6c8993537..b6f3eb38df 100644
--- a/tests/t107-vector.c
+++ b/tests/t107-vector.c
@@ -17,8 +17,15 @@ int main(int argc, char **argv) {
 
   CeedVectorView(x, "%12.8f", stdout);
 
-  CeedVectorSetNumViewTabs(x, 1);
-  CeedVectorView(x, "%12.8f", stdout);
+  // Check tabs and CeedObject functionality
+  {
+    CeedVector x_copy = NULL;
+
+    CeedVectorReferenceCopy(x, &x_copy);
+    CeedVectorSetNumViewTabs(x_copy, 1);
+    CeedObjectView((CeedObject)x_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&x_copy);
+  }
 
   CeedVectorDestroy(&x);
   CeedDestroy(&ceed);
diff --git a/tests/t210-elemrestriction.c b/tests/t210-elemrestriction.c
index 13781acc33..1cefd2d185 100644
--- a/tests/t210-elemrestriction.c
+++ b/tests/t210-elemrestriction.c
@@ -18,8 +18,16 @@ int main(int argc, char **argv) {
   CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_elem + 1, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
   CeedElemRestrictionView(elem_restriction, stdout);
-  CeedElemRestrictionSetNumViewTabs(elem_restriction, 1);
-  CeedElemRestrictionView(elem_restriction, stdout);
+
+  // Check tabs and CeedObject functionality
+  {
+    CeedElemRestriction elem_restriction_copy = NULL;
+
+    CeedElemRestrictionReferenceCopy(elem_restriction, &elem_restriction_copy);
+    CeedElemRestrictionSetNumViewTabs(elem_restriction_copy, 1);
+    CeedObjectView((CeedObject)elem_restriction_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&elem_restriction_copy);
+  }
 
   CeedElemRestrictionDestroy(&elem_restriction);
   CeedDestroy(&ceed);
diff --git a/tests/t300-basis.c b/tests/t300-basis.c
index da563a5c0e..d340be94e3 100644
--- a/tests/t300-basis.c
+++ b/tests/t300-basis.c
@@ -18,10 +18,18 @@ int main(int argc, char **argv) {
 
   CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 4, 4, CEED_GAUSS, &basis);
   CeedBasisView(basis, stdout);
-  CeedBasisSetNumViewTabs(basis, 1);
-  CeedBasisView(basis, stdout);
-  CeedBasisDestroy(&basis);
 
+  // Check tabs and CeedObject functionality
+  {
+    CeedBasis basis_copy = NULL;
+
+    CeedBasisReferenceCopy(basis, &basis_copy);
+    CeedBasisSetNumViewTabs(basis_copy, 1);
+    CeedObjectView((CeedObject)basis_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&basis_copy);
+  }
+
+  CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t402-qfunction.c b/tests/t402-qfunction.c
index 8e15a3e5a1..6f24d492f0 100644
--- a/tests/t402-qfunction.c
+++ b/tests/t402-qfunction.c
@@ -34,10 +34,21 @@ int main(int argc, char **argv) {
   }
   CeedQFunctionContextView(ctx, stdout);
 
-  CeedQFunctionSetNumViewTabs(qf_mass, 1);
-  CeedQFunctionView(qf_mass, stdout);
-  CeedQFunctionContextSetNumViewTabs(ctx, 1);
-  CeedQFunctionContextView(ctx, stdout);
+  // Check tabs and CeedObject functionality
+  {
+    CeedQFunction        qf_mass_copy = NULL;
+    CeedQFunctionContext ctx_copy     = NULL;
+
+    CeedQFunctionReferenceCopy(qf_mass, &qf_mass_copy);
+    CeedQFunctionSetNumViewTabs(qf_mass_copy, 1);
+    CeedObjectView((CeedObject)qf_mass_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&qf_mass_copy);
+
+    CeedQFunctionContextReferenceCopy(ctx, &ctx_copy);
+    CeedQFunctionContextSetNumViewTabs(ctx_copy, 1);
+    CeedObjectView((CeedObject)ctx_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&ctx_copy);
+  }
 
   CeedQFunctionDestroy(&qf_setup);
   CeedQFunctionDestroy(&qf_mass);
diff --git a/tests/t504-operator.c b/tests/t504-operator.c
index d7046ab08d..41dfcc7962 100644
--- a/tests/t504-operator.c
+++ b/tests/t504-operator.c
@@ -69,10 +69,18 @@ int main(int argc, char **argv) {
   CeedOperatorSetName(op_setup, "setup");
   CeedOperatorViewTerse(op_setup, stdout);
   CeedOperatorView(op_setup, stdout);
-  CeedOperatorSetName(op_mass, "mass");
-  CeedOperatorSetNumViewTabs(op_mass, 1);
-  CeedOperatorViewTerse(op_mass, stdout);
-  CeedOperatorView(op_mass, stdout);
+
+  // Check tabs and CeedObject functionality
+  {
+    CeedOperator op_mass_copy = NULL;
+
+    CeedOperatorReferenceCopy(op_mass, &op_mass_copy);
+    CeedOperatorSetName(op_mass_copy, "mass");
+    CeedOperatorSetNumViewTabs(op_mass_copy, 1);
+    CeedOperatorViewTerse(op_mass_copy, stdout);
+    CeedObjectView((CeedObject)op_mass_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&op_mass_copy);
+  }
 
   CeedVectorDestroy(&q_data);
   CeedElemRestrictionDestroy(&elem_restriction_u);

From 75c339d669dd3e1a40b793905dfb8a312beee6a6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 20 Jan 2026 12:27:32 -0700
Subject: [PATCH 523/571] doc - clarify CeedObjectCreate usage

---
 interface/ceed-object.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/interface/ceed-object.c b/interface/ceed-object.c
index afb1575e4c..24b38dbb31 100644
--- a/interface/ceed-object.c
+++ b/interface/ceed-object.c
@@ -19,7 +19,11 @@
 /// @{
 
 /**
-  @brief Create a `CeedObject`
+  @brief Create a `CeedObject`.
+
+  Note: This interface takes a `CeedObject` and not a pointer to a `CeedObject` like other `Ceed*Create` interfaces.
+          This `CeedObject` will have already been allocated a the first part of the `Ceed*` struct.
+          This function is only intended to be called inside of `Ceed*Create` functions.
 
   @param[in]  ceed             `Ceed` object to reference
   @param[in]  view_function    `Ceed*` function for viewing the `obj`

From 378c2ab09ffbfaf74726e2c51fd5faf548bf58ad Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 21 Jan 2026 10:39:12 -0700
Subject: [PATCH 524/571] cov - minor fixes

---
 .gitlab-ci.yml     | 10 +++++-----
 tests/t302-basis.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a907bfb1db..5df451315e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -121,7 +121,7 @@ noether-cpu:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -209,7 +209,7 @@ noether-rust-qfunctions:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -290,7 +290,7 @@ noether-cuda:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -357,7 +357,7 @@ noether-cuda:
 #  after_script:
 #    - |
 #      if [ -f .SUCCESS ]; then
-#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -460,7 +460,7 @@ noether-float:
   after_script:
     - |
       if [ $(cat .job_status) == "SUCCESS" ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
diff --git a/tests/t302-basis.c b/tests/t302-basis.c
index 72623f400f..bfe94c8ef5 100644
--- a/tests/t302-basis.c
+++ b/tests/t302-basis.c
@@ -26,7 +26,7 @@ int main(int argc, char **argv) {
       if (fabs(collocated_gradient_1d[j + p * i] - gradient_1d[j + p * i]) > 100 * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("Error in collocated gradient %f != %f\n", collocated_gradient_1d[j + p * i], gradient_1d[j + p * i]);
-        // LCOV_EXCL_START
+        // LCOV_EXCL_STOP
       }
     }
   }

From e9f76d14a1d2b6594a8aca2075cf16ebd9c3bf6f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 21 Jan 2026 10:57:58 -0700
Subject: [PATCH 525/571] doc - minor fixes

---
 interface/ceed-config.c    | 10 ++++++----
 interface/ceed-qfunction.c |  6 +++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/interface/ceed-config.c b/interface/ceed-config.c
index 49d428d926..37ae708ec7 100644
--- a/interface/ceed-config.c
+++ b/interface/ceed-config.c
@@ -36,9 +36,10 @@ int CeedGetGitVersion(const char **git_version) {
 }
 
 /**
-  @brief Set whether or not to use clang when compiling for GPU (instead of nvrtc)
+  @brief Set whether or not to use Clang when compiling for GPU (instead of nvrtc)
 
-  @param[in] is_clang Whether or not to use clang on GPU
+  @param[in,out]  ceed     `Ceed` context to set Clang GPU compilation flag
+  @param[in]      is_clang Flag to use clang for GPU compilation
 
   @ref Developer
 
@@ -52,9 +53,10 @@ int CeedSetIsClang(Ceed ceed, bool is_clang) {
 }
 
 /**
-  @brief Determine if the current ceed is set to compile with clang when on GPU
+  @brief Determine if the current `ceed` is set to compile with Clang for CPU
 
-  @param[out] is_clang The location to write the current GPU clang status to
+  @param[in]  ceed     `Ceed` context to get Clang GPU compilation flag
+  @param[out] is_clang Variable to store Clang GPU compilation flag
 
   @ref Developer
 
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index e9a6430925..8f2ffbbd70 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -228,10 +228,10 @@ int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input, CeedInt *num_o
 
 /**
   @brief Get the name of the `CeedQFunction`.
-    Use the `name` if created via @ref `CeedQFunctionCreateInteriorByName`, otherwise return the kernel name via @ref `CeedQFunctionGetKernelName`.
+    Use the `name` if created via @ref CeedQFunctionCreateInteriorByName(), otherwise return the kernel name via @ref CeedQFunctionGetKernelName().
 
-  @param[in]  qf          `CeedQFunction`
-  @param[out] kernel_name Variable to store `CeedQFunction` name
+  @param[in]  qf   `CeedQFunction`
+  @param[out] name Variable to store `CeedQFunction` name
 
   @return An error code: 0 - success, otherwise - failure
 

From 73c5a4d20b1e9054b8bd6349271e27f9c7e6d692 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 21 Jan 2026 11:29:47 -0700
Subject: [PATCH 526/571] minor - reorder to prevent theoretical leak

---
 interface/ceed.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/interface/ceed.c b/interface/ceed.c
index 790a0cb686..6c94ec8db8 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1619,6 +1619,8 @@ int CeedView(Ceed ceed, FILE *stream) {
   char       *tabs = NULL;
   CeedMemType mem_type;
 
+  CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
+
   {
     CeedInt num_tabs = 0;
 
@@ -1626,9 +1628,6 @@ int CeedView(Ceed ceed, FILE *stream) {
     CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
     for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
   }
-
-  CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
-
   fprintf(stream,
           "%sCeed\n"
           "%s  Ceed Resource: %s\n"

From 505462d430205018e042274803877f51a0153482 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 23 Jan 2026 12:23:55 -0700
Subject: [PATCH 527/571] doc - remove hovverxref; its deprecated

---
 doc/sphinx/source/conf.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
index 4353a893b5..3f956d3785 100755
--- a/doc/sphinx/source/conf.py
+++ b/doc/sphinx/source/conf.py
@@ -40,7 +40,6 @@
 extensions = [
     "sphinxext_altair.altairplot",
     "breathe",
-    "hoverxref.extension",
     "sphinx_design",
     "myst_parser",
     "sphinx_rtd_theme",
@@ -162,13 +161,6 @@
     ]
 }
 
-# hoverxref options
-hoverxref_auto_ref = True
-hoverxref_mathjax = True
-hoverxref_role_types = {
-    "ref": "modal",
-}
-
 latex_macros = r"""
 \def \diff {\operatorname{d}\!}
 \def \tcolon {\!:\!}

From 65af418b5ee547b431a105e503cbe8faf1805930 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 6 May 2025 13:24:48 -0600
Subject: [PATCH 528/571] make - disable D flag for AR on Darwin

---
 Makefile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 5ecaecd71a..5bee63eaa1 100644
--- a/Makefile
+++ b/Makefile
@@ -32,6 +32,9 @@ quiet ?= $($(1))
 .PRECIOUS: %/.DIR
 
 
+DARWIN := $(filter Darwin,$(shell uname -s))
+
+
 # ------------------------------------------------------------
 # Root directories for backend dependencies
 # ------------------------------------------------------------
@@ -72,7 +75,7 @@ ifeq (,$(filter-out undefined default,$(origin AR)))
   AR = ar
 endif
 ifeq (,$(filter-out undefined default,$(origin ARFLAGS)))
-  ARFLAGS = crD
+  ARFLAGS = $(if $(DARWIN),cr,crD)
 endif
 NVCC ?= $(CUDA_DIR)/bin/nvcc
 NVCC_CXX ?= $(CXX)
@@ -222,7 +225,6 @@ MFLAGS := -j $(NPROCS) --warn-undefined-variables \
 PYTHON ?= python3
 PROVE ?= prove
 PROVE_OPTS ?= -j $(NPROCS)
-DARWIN := $(filter Darwin,$(shell uname -s))
 SO_EXT := $(if $(DARWIN),dylib,so)
 
 ceed.pc := $(LIBDIR)/pkgconfig/ceed.pc

From 40fa5a58ea35e52695f55b19c0da75a46f741190 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 22 Jan 2026 16:07:45 -0700
Subject: [PATCH 529/571] weak - in backend-weak, only have weak definition,
 not declaration

---
 backends/ceed-backend-weak.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/backends/ceed-backend-weak.c b/backends/ceed-backend-weak.c
index 81abd45a3d..6df1f249cc 100644
--- a/backends/ceed-backend-weak.c
+++ b/backends/ceed-backend-weak.c
@@ -37,8 +37,7 @@ static int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
 }
 // LCOV_EXCL_STOP
 
-#define CEED_BACKEND(name, num_prefixes, ...)       \
-  CEED_INTERN int name(void) __attribute__((weak)); \
-  int             name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 #include "ceed-backend-list.h"
 #undef CEED_BACKEND

From 631bf14c71f0df4faaa3a04b0bc325d74013e42d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 23 Jan 2026 10:39:23 -0700
Subject: [PATCH 530/571] weak - inline helper

---
 backends/ceed-backend-weak.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/ceed-backend-weak.c b/backends/ceed-backend-weak.c
index 6df1f249cc..820f610848 100644
--- a/backends/ceed-backend-weak.c
+++ b/backends/ceed-backend-weak.c
@@ -17,7 +17,7 @@ static int CeedInit_Weak(const char *resource, Ceed ceed) {
 }
 
 // This function provides a debug target for weak symbols
-static int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
+static inline int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
   va_list prefixes;
   int     ierr;
 

From cdc02741cf0d4c36a3f809dfa4e8274cffc9a29d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 23 Jan 2026 10:56:43 -0700
Subject: [PATCH 531/571] make - reorder backend processing

---
 backends/ceed-backend-list.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
index 017d8a76a0..2c69e9a53c 100644
--- a/backends/ceed-backend-list.h
+++ b/backends/ceed-backend-list.h
@@ -11,8 +11,16 @@
 // This will be expanded inside CeedRegisterAll() to call each registration function in the order listed, and also to define weak symbol aliases for
 // backends that are not configured.
 
+CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
+CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
+CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
+CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
 CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked")
 CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial")
+CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
+CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
+CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
+CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
 CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref")
 CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen")
 CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared")
@@ -24,11 +32,3 @@ CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared")
 CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen")
 CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
 CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
-CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
-CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
-CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
-CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
-CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
-CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
-CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
-CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")

From d15030167e504f21b69a912dc34420b169f681b5 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 23 Jan 2026 10:56:43 -0700
Subject: [PATCH 532/571] build - split weak defs

---
 Makefile                                |  2 +-
 backends/weak/ceed-avx-weak.c           | 16 ++++++++++++++++
 backends/{ => weak}/ceed-backend-weak.c |  8 ++------
 backends/weak/ceed-backend-weak.h       | 12 ++++++++++++
 backends/weak/ceed-cuda-weak.c          | 17 +++++++++++++++++
 backends/weak/ceed-hip-weak.c           | 17 +++++++++++++++++
 backends/weak/ceed-magma-weak.c         | 16 ++++++++++++++++
 backends/weak/ceed-memcheck-weak.c      | 16 ++++++++++++++++
 backends/weak/ceed-sycl-weak.c          | 17 +++++++++++++++++
 backends/weak/ceed-xsmm-weak.c          | 16 ++++++++++++++++
 10 files changed, 130 insertions(+), 7 deletions(-)
 create mode 100644 backends/weak/ceed-avx-weak.c
 rename backends/{ => weak}/ceed-backend-weak.c (80%)
 create mode 100644 backends/weak/ceed-backend-weak.h
 create mode 100644 backends/weak/ceed-cuda-weak.c
 create mode 100644 backends/weak/ceed-hip-weak.c
 create mode 100644 backends/weak/ceed-magma-weak.c
 create mode 100644 backends/weak/ceed-memcheck-weak.c
 create mode 100644 backends/weak/ceed-sycl-weak.c
 create mode 100644 backends/weak/ceed-xsmm-weak.c

diff --git a/Makefile b/Makefile
index 5bee63eaa1..7b13417175 100644
--- a/Makefile
+++ b/Makefile
@@ -282,7 +282,7 @@ $(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(
 # ------------------------------------------------------------
 
 # Interface and gallery
-libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/*.c gallery/*.c))
+libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/weak/*.c gallery/*.c))
 gallery.c := $(wildcard gallery/*/ceed*.c)
 libceed.c += $(gallery.c)
 
diff --git a/backends/weak/ceed-avx-weak.c b/backends/weak/ceed-avx-weak.c
new file mode 100644
index 0000000000..790611b902
--- /dev/null
+++ b/backends/weak/ceed-avx-weak.c
@@ -0,0 +1,16 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+// LCOV_EXCL_START
+CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked")
+CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial")
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/ceed-backend-weak.c b/backends/weak/ceed-backend-weak.c
similarity index 80%
rename from backends/ceed-backend-weak.c
rename to backends/weak/ceed-backend-weak.c
index 820f610848..1ae70f81a6 100644
--- a/backends/ceed-backend-weak.c
+++ b/backends/weak/ceed-backend-weak.c
@@ -5,6 +5,7 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
+#include "ceed-backend-weak.h"
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <stdarg.h>
@@ -17,7 +18,7 @@ static int CeedInit_Weak(const char *resource, Ceed ceed) {
 }
 
 // This function provides a debug target for weak symbols
-static inline int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
+int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
   va_list prefixes;
   int     ierr;
 
@@ -36,8 +37,3 @@ static inline int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
   return CEED_ERROR_SUCCESS;
 }
 // LCOV_EXCL_STOP
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
-#include "ceed-backend-list.h"
-#undef CEED_BACKEND
diff --git a/backends/weak/ceed-backend-weak.h b/backends/weak/ceed-backend-weak.h
new file mode 100644
index 0000000000..713be68518
--- /dev/null
+++ b/backends/weak/ceed-backend-weak.h
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdarg.h>
+
+CEED_INTERN int CeedRegister_Weak(const char *name, int num_prefixes, ...);
diff --git a/backends/weak/ceed-cuda-weak.c b/backends/weak/ceed-cuda-weak.c
new file mode 100644
index 0000000000..0654fa596a
--- /dev/null
+++ b/backends/weak/ceed-cuda-weak.c
@@ -0,0 +1,17 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+// LCOV_EXCL_START
+CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref")
+CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen")
+CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared")
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-hip-weak.c b/backends/weak/ceed-hip-weak.c
new file mode 100644
index 0000000000..41f327df16
--- /dev/null
+++ b/backends/weak/ceed-hip-weak.c
@@ -0,0 +1,17 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+// LCOV_EXCL_START
+CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref")
+CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen")
+CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared")
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-magma-weak.c b/backends/weak/ceed-magma-weak.c
new file mode 100644
index 0000000000..1b8a40149c
--- /dev/null
+++ b/backends/weak/ceed-magma-weak.c
@@ -0,0 +1,16 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+// LCOV_EXCL_START
+CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
+CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-memcheck-weak.c b/backends/weak/ceed-memcheck-weak.c
new file mode 100644
index 0000000000..2c6d69b045
--- /dev/null
+++ b/backends/weak/ceed-memcheck-weak.c
@@ -0,0 +1,16 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+// LCOV_EXCL_START
+CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
+CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-sycl-weak.c b/backends/weak/ceed-sycl-weak.c
new file mode 100644
index 0000000000..93f9872e62
--- /dev/null
+++ b/backends/weak/ceed-sycl-weak.c
@@ -0,0 +1,17 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+// LCOV_EXCL_START
+CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref")
+CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared")
+CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen")
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-xsmm-weak.c b/backends/weak/ceed-xsmm-weak.c
new file mode 100644
index 0000000000..3198da1aa5
--- /dev/null
+++ b/backends/weak/ceed-xsmm-weak.c
@@ -0,0 +1,16 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
+// LCOV_EXCL_START
+CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
+CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND

From bdee0278611904727ee35fcc2d0d7c3bf83db4c4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 23 Jan 2026 11:56:08 -0700
Subject: [PATCH 533/571] rust - add macos CI job

---
 .github/workflows/rust-test-with-style.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rust-test-with-style.yml b/.github/workflows/rust-test-with-style.yml
index f8e9499fe2..80ea4a4dbf 100644
--- a/.github/workflows/rust-test-with-style.yml
+++ b/.github/workflows/rust-test-with-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-24.04]
+        os: [ubuntu-24.04, macos-15]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}

From 9569b36af1e3012330679c2612d8558bdbd28fc8 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 23 Jan 2026 16:57:02 -0700
Subject: [PATCH 534/571] build - simplify includes to reduce duplication

---
 backends/ceed-backend-list-avx.h      | 13 ++++++++
 backends/ceed-backend-list-cuda.h     | 14 +++++++++
 backends/ceed-backend-list-hip.h      | 14 +++++++++
 backends/ceed-backend-list-magma.h    | 13 ++++++++
 backends/ceed-backend-list-memcheck.h | 13 ++++++++
 backends/ceed-backend-list-ref.h      | 15 +++++++++
 backends/ceed-backend-list-sycl.h     | 14 +++++++++
 backends/ceed-backend-list-xsmm.h     | 13 ++++++++
 backends/ceed-backend-list.h          | 45 ++++++++++++---------------
 backends/weak/ceed-avx-weak.c         |  6 +---
 backends/weak/ceed-backend-weak.h     |  3 ++
 backends/weak/ceed-cuda-weak.c        |  7 +----
 backends/weak/ceed-hip-weak.c         |  7 +----
 backends/weak/ceed-magma-weak.c       |  6 +---
 backends/weak/ceed-memcheck-weak.c    |  6 +---
 backends/weak/ceed-sycl-weak.c        |  7 +----
 backends/weak/ceed-xsmm-weak.c        |  6 +---
 17 files changed, 139 insertions(+), 63 deletions(-)
 create mode 100644 backends/ceed-backend-list-avx.h
 create mode 100644 backends/ceed-backend-list-cuda.h
 create mode 100644 backends/ceed-backend-list-hip.h
 create mode 100644 backends/ceed-backend-list-magma.h
 create mode 100644 backends/ceed-backend-list-memcheck.h
 create mode 100644 backends/ceed-backend-list-ref.h
 create mode 100644 backends/ceed-backend-list-sycl.h
 create mode 100644 backends/ceed-backend-list-xsmm.h

diff --git a/backends/ceed-backend-list-avx.h b/backends/ceed-backend-list-avx.h
new file mode 100644
index 0000000000..5e19a016c7
--- /dev/null
+++ b/backends/ceed-backend-list-avx.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked")
+CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial")
diff --git a/backends/ceed-backend-list-cuda.h b/backends/ceed-backend-list-cuda.h
new file mode 100644
index 0000000000..87593f5b08
--- /dev/null
+++ b/backends/ceed-backend-list-cuda.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref")
+CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen")
+CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared")
diff --git a/backends/ceed-backend-list-hip.h b/backends/ceed-backend-list-hip.h
new file mode 100644
index 0000000000..e66fc98298
--- /dev/null
+++ b/backends/ceed-backend-list-hip.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref")
+CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen")
+CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared")
diff --git a/backends/ceed-backend-list-magma.h b/backends/ceed-backend-list-magma.h
new file mode 100644
index 0000000000..66c985c884
--- /dev/null
+++ b/backends/ceed-backend-list-magma.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
+CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
diff --git a/backends/ceed-backend-list-memcheck.h b/backends/ceed-backend-list-memcheck.h
new file mode 100644
index 0000000000..fa6f51b0bb
--- /dev/null
+++ b/backends/ceed-backend-list-memcheck.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
+CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
diff --git a/backends/ceed-backend-list-ref.h b/backends/ceed-backend-list-ref.h
new file mode 100644
index 0000000000..ac3e21525d
--- /dev/null
+++ b/backends/ceed-backend-list-ref.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
+CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
+CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
+CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
diff --git a/backends/ceed-backend-list-sycl.h b/backends/ceed-backend-list-sycl.h
new file mode 100644
index 0000000000..88617e1b2b
--- /dev/null
+++ b/backends/ceed-backend-list-sycl.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref")
+CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared")
+CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen")
diff --git a/backends/ceed-backend-list-xsmm.h b/backends/ceed-backend-list-xsmm.h
new file mode 100644
index 0000000000..fee5f81102
--- /dev/null
+++ b/backends/ceed-backend-list-xsmm.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
+CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
index 2c69e9a53c..77f8e34490 100644
--- a/backends/ceed-backend-list.h
+++ b/backends/ceed-backend-list.h
@@ -5,30 +5,25 @@
 //
 // This file is part of CEED:  http://github.com/ceed
 
-// This header does not have guards because it is included multiple times.
+// This header does not have guards because it may be included multiple times.
 
-// List each backend registration function once here.
-// This will be expanded inside CeedRegisterAll() to call each registration function in the order listed, and also to define weak symbol aliases for
-// backends that are not configured.
+// List each backend registration function in the corresponding `ceed-backend-list-*.h` file, grouped by install requirement.
+// Include each of those files here.
+// This will be expanded inside CeedRegisterAll() to call each registration function in the order listed.
 
-CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
-CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
-CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
-CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
-CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked")
-CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial")
-CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
-CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
-CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
-CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
-CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref")
-CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen")
-CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared")
-CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref")
-CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen")
-CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared")
-CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref")
-CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared")
-CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen")
-CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
-CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
+// Always compiled
+#include "ceed-backend-list-ref.h"
+// Requires AVX support
+#include "ceed-backend-list-avx.h"
+// Requires Valgrind
+#include "ceed-backend-list-memcheck.h"
+// Requires LIBXSMM
+#include "ceed-backend-list-xsmm.h"
+// Requires CUDA
+#include "ceed-backend-list-cuda.h"
+// Requires ROCm
+#include "ceed-backend-list-hip.h"
+// Requires SYCL
+#include "ceed-backend-list-sycl.h"
+// Requires MAGMA + (CUDA or ROCm)
+#include "ceed-backend-list-magma.h"
diff --git a/backends/weak/ceed-avx-weak.c b/backends/weak/ceed-avx-weak.c
index 790611b902..639c08f63b 100644
--- a/backends/weak/ceed-avx-weak.c
+++ b/backends/weak/ceed-avx-weak.c
@@ -6,11 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 #include "ceed-backend-weak.h"
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 // LCOV_EXCL_START
-CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked")
-CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial")
+#include "../ceed-backend-list-avx.h"
 // LCOV_EXCL_STOP
 #undef CEED_BACKEND
diff --git a/backends/weak/ceed-backend-weak.h b/backends/weak/ceed-backend-weak.h
index 713be68518..b828c44394 100644
--- a/backends/weak/ceed-backend-weak.h
+++ b/backends/weak/ceed-backend-weak.h
@@ -10,3 +10,6 @@
 #include <stdarg.h>
 
 CEED_INTERN int CeedRegister_Weak(const char *name, int num_prefixes, ...);
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
diff --git a/backends/weak/ceed-cuda-weak.c b/backends/weak/ceed-cuda-weak.c
index 0654fa596a..8bc81c78f5 100644
--- a/backends/weak/ceed-cuda-weak.c
+++ b/backends/weak/ceed-cuda-weak.c
@@ -6,12 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 #include "ceed-backend-weak.h"
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 // LCOV_EXCL_START
-CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref")
-CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen")
-CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared")
+#include "../ceed-backend-list-cuda.h"
 // LCOV_EXCL_STOP
 #undef CEED_BACKEND
diff --git a/backends/weak/ceed-hip-weak.c b/backends/weak/ceed-hip-weak.c
index 41f327df16..ec90d3bdee 100644
--- a/backends/weak/ceed-hip-weak.c
+++ b/backends/weak/ceed-hip-weak.c
@@ -6,12 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 #include "ceed-backend-weak.h"
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 // LCOV_EXCL_START
-CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref")
-CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen")
-CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared")
+#include "../ceed-backend-list-hip.h"
 // LCOV_EXCL_STOP
 #undef CEED_BACKEND
diff --git a/backends/weak/ceed-magma-weak.c b/backends/weak/ceed-magma-weak.c
index 1b8a40149c..cace059504 100644
--- a/backends/weak/ceed-magma-weak.c
+++ b/backends/weak/ceed-magma-weak.c
@@ -6,11 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 #include "ceed-backend-weak.h"
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 // LCOV_EXCL_START
-CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
-CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
+#include "../ceed-backend-list-magma.h"
 // LCOV_EXCL_STOP
 #undef CEED_BACKEND
diff --git a/backends/weak/ceed-memcheck-weak.c b/backends/weak/ceed-memcheck-weak.c
index 2c6d69b045..35fd01613b 100644
--- a/backends/weak/ceed-memcheck-weak.c
+++ b/backends/weak/ceed-memcheck-weak.c
@@ -6,11 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 #include "ceed-backend-weak.h"
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 // LCOV_EXCL_START
-CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
-CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
+#include "../ceed-backend-list-memcheck.h"
 // LCOV_EXCL_STOP
 #undef CEED_BACKEND
diff --git a/backends/weak/ceed-sycl-weak.c b/backends/weak/ceed-sycl-weak.c
index 93f9872e62..92bc508449 100644
--- a/backends/weak/ceed-sycl-weak.c
+++ b/backends/weak/ceed-sycl-weak.c
@@ -6,12 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 #include "ceed-backend-weak.h"
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 // LCOV_EXCL_START
-CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref")
-CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared")
-CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen")
+#include "../ceed-backend-list-sycl.h"
 // LCOV_EXCL_STOP
 #undef CEED_BACKEND
diff --git a/backends/weak/ceed-xsmm-weak.c b/backends/weak/ceed-xsmm-weak.c
index 3198da1aa5..6ae36a2822 100644
--- a/backends/weak/ceed-xsmm-weak.c
+++ b/backends/weak/ceed-xsmm-weak.c
@@ -6,11 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 
 #include "ceed-backend-weak.h"
-
-#define CEED_BACKEND(name, num_prefixes, ...) \
-  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
 // LCOV_EXCL_START
-CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
-CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
+#include "../ceed-backend-list-xsmm.h"
 // LCOV_EXCL_STOP
 #undef CEED_BACKEND

From 860dc8215fde8015ce701d5081dc7f665abfde5f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 29 Jan 2026 09:50:01 -0700
Subject: [PATCH 535/571] ex - add fortran ex1

---
 Makefile                       |     8 +-
 coverage.info                  | 12848 +++++++++++++++++++++++++++++++
 examples/ceed/ex1-volume-f-c.h |    59 +
 examples/ceed/ex1-volume-f.f90 |   557 ++
 examples/ceed/ex1-volume-f.h   |    55 +
 examples/ceed/ex1-volume.h     |     7 +-
 examples/ceed/ex2-surface.h    |     4 +-
 examples/ceed/ex3-volume.h     |     4 +-
 tests/junit.py                 |     5 +-
 9 files changed, 13537 insertions(+), 10 deletions(-)
 create mode 100644 coverage.info
 create mode 100644 examples/ceed/ex1-volume-f-c.h
 create mode 100644 examples/ceed/ex1-volume-f.f90
 create mode 100644 examples/ceed/ex1-volume-f.h

diff --git a/Makefile b/Makefile
index 7b13417175..ebefbe877d 100644
--- a/Makefile
+++ b/Makefile
@@ -333,9 +333,9 @@ tests   += $(tests.f:tests/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
 
 # Examples
 examples.c := $(sort $(wildcard examples/ceed/*.c))
-examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f)))
+examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f90)))
 examples   := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
-examples   += $(examples.f:examples/ceed/%.f=$(OBJDIR)/%$(EXE_SUFFIX))
+examples   += $(examples.f:examples/ceed/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
 
 # deal.II Examples
 dealiiexamples := $(OBJDIR)/dealii-bps
@@ -687,7 +687,7 @@ $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.f90 | $$(@D)/.DIR
 $(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.c | $$(@D)/.DIR
 	$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
 
-$(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.f | $$(@D)/.DIR
+$(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.f90 | $$(@D)/.DIR
 	$(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $(<D))/"' $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
 
 
@@ -928,7 +928,7 @@ CLANG_FORMAT_OPTS += -style=file -i
 AUTOPEP8          ?= autopep8
 AUTOPEP8_OPTS     += --in-place --aggressive --max-line-length 120
 
-format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]' '*.cu'))
+format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h) $(wildcard examples/ceed/ex*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]' '*.cu'))
 format.py := $(filter-out tests/junit-xml/junit_xml/__init__.py, $(shell git ls-files '*.py'))
 format.ot := $(filter-out doc/sphinx/source/CODE_OF_CONDUCT.md doc/sphinx/source/CONTRIBUTING.md, $(shell git ls-files '*.md' '*.f90'))
 
diff --git a/coverage.info b/coverage.info
new file mode 100644
index 0000000000..2e177c3e35
--- /dev/null
+++ b/coverage.info
@@ -0,0 +1,12848 @@
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/blocked/ceed-blocked-operator.c
+FNL:0,19,197
+FNA:0,192,CeedOperatorSetupFields_Blocked
+FNL:1,202,266
+FNA:1,96,CeedOperatorSetup_Blocked
+FNL:2,271,303
+FNA:2,96,CeedOperatorSetupInputs_Blocked
+FNL:3,308,354
+FNA:3,192,CeedOperatorInputBasis_Blocked
+FNL:4,359,400
+FNA:4,192,CeedOperatorOutputBasis_Blocked
+FNL:5,405,427
+FNA:5,96,CeedOperatorRestoreInputs_Blocked
+FNL:6,432,520
+FNA:6,96,CeedOperatorApplyAdd_Blocked
+FNL:7,525,719
+FNA:7,0,CeedOperatorLinearAssembleQFunctionCore_Blocked
+FNL:8,724,726
+FNA:8,0,CeedOperatorLinearAssembleQFunction_Blocked
+FNL:9,731,733
+FNA:9,0,CeedOperatorLinearAssembleQFunctionUpdate_Blocked
+FNL:10,738,775
+FNA:10,96,CeedOperatorDestroy_Blocked
+FNL:11,780,793
+FNA:11,96,CeedOperatorCreate_Blocked
+FNF:12
+FNH:9
+DA:19,192
+DA:31,192
+DA:32,192
+DA:33,192
+DA:34,192
+DA:36,192
+DA:37,96
+DA:38,96
+DA:40,96
+DA:41,96
+DA:45,480
+DA:49,288
+DA:50,288
+DA:57,240
+DA:58,240
+DA:59,240
+DA:60,240
+DA:61,240
+DA:62,240
+DA:63,240
+DA:65,240
+DA:66,240
+DA:67,144
+DA:68,144
+DA:70,144
+DA:71,144
+DA:73,144
+DA:74,144
+DA:75,0
+DA:76,0
+DA:77,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:83,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:90,0
+DA:91,0
+DA:92,0
+DA:95,0
+DA:96,0
+DA:97,0
+DA:98,96
+DA:101,96
+DA:102,96
+DA:104,96
+DA:105,0
+DA:107,0
+DA:109,240
+DA:110,240
+DA:111,240
+DA:114,288
+DA:115,96
+DA:116,96
+DA:117,96
+DA:118,96
+DA:119,96
+DA:120,144
+DA:124,144
+DA:125,144
+DA:126,144
+DA:127,144
+DA:128,144
+DA:129,144
+DA:130,144
+DA:131,144
+DA:132,144
+DA:133,144
+DA:134,48
+DA:135,48
+DA:136,48
+DA:137,48
+DA:138,48
+DA:139,48
+DA:140,48
+DA:144,192
+DA:145,288
+DA:149,192
+DA:150,192
+DA:151,288
+DA:155,96
+DA:156,96
+DA:157,96
+DA:158,0
+DA:159,0
+DA:160,0
+DA:162,96
+DA:163,96
+DA:165,192
+DA:166,192
+DA:169,192
+DA:173,96
+DA:174,96
+DA:175,96
+DA:179,0
+DA:180,0
+DA:181,0
+DA:182,0
+DA:183,0
+DA:184,0
+DA:185,0
+DA:186,0
+DA:188,0
+DA:189,0
+DA:191,96
+DA:192,96
+DA:195,192
+DA:196,192
+DA:202,96
+DA:205,96
+DA:211,96
+DA:212,96
+DA:214,96
+DA:215,96
+DA:216,96
+DA:217,96
+DA:218,96
+DA:219,96
+DA:222,96
+DA:223,96
+DA:225,96
+DA:226,96
+DA:227,96
+DA:228,96
+DA:229,96
+DA:230,96
+DA:231,96
+DA:232,96
+DA:233,96
+DA:235,96
+DA:236,96
+DA:240,96
+DA:243,96
+DA:248,96
+DA:252,0
+DA:253,0
+DA:254,0
+DA:256,0
+DA:257,0
+DA:259,0
+DA:263,96
+DA:264,96
+DA:265,96
+DA:271,96
+DA:274,288
+DA:281,192
+DA:282,192
+DA:283,192
+DA:284,96
+DA:285,96
+DA:288,192
+DA:289,192
+DA:292,144
+DA:293,144
+DA:294,144
+DA:296,144
+DA:298,144
+DA:300,192
+DA:302,96
+DA:308,192
+DA:311,576
+DA:318,384
+DA:322,0
+DA:323,0
+DA:324,0
+DA:325,0
+DA:329,384
+DA:330,384
+DA:331,384
+DA:332,384
+DA:333,384
+DA:335,384
+DA:336,96
+DA:337,96
+DA:338,96
+DA:339,192
+DA:343,192
+DA:344,192
+DA:345,192
+DA:346,192
+DA:347,192
+DA:348,192
+DA:349,96
+DA:350,96
+DA:353,192
+DA:359,192
+DA:362,384
+DA:369,192
+DA:370,192
+DA:371,192
+DA:372,192
+DA:374,192
+DA:375,96
+DA:376,96
+DA:377,96
+DA:381,96
+DA:382,96
+DA:383,96
+DA:385,96
+DA:386,0
+DA:388,96
+DA:390,96
+DA:391,96
+DA:399,192
+DA:405,96
+DA:407,288
+DA:411,192
+DA:415,0
+DA:416,0
+DA:417,0
+DA:418,0
+DA:420,192
+DA:421,192
+DA:423,144
+DA:426,96
+DA:432,96
+DA:434,96
+DA:436,96
+DA:443,96
+DA:445,96
+DA:448,96
+DA:449,0
+DA:450,0
+DA:451,0
+DA:453,96
+DA:454,96
+DA:455,96
+DA:456,96
+DA:457,96
+DA:458,96
+DA:461,96
+DA:464,192
+DA:465,96
+DA:466,0
+DA:468,96
+DA:473,288
+DA:475,384
+DA:476,192
+DA:477,192
+DA:478,96
+DA:479,96
+DA:485,192
+DA:488,192
+DA:489,192
+DA:493,192
+DA:498,192
+DA:502,96
+DA:504,96
+DA:506,96
+DA:507,96
+DA:509,96
+DA:511,96
+DA:513,96
+DA:517,96
+DA:518,96
+DA:519,96
+DA:525,0
+DA:529,0
+DA:531,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:540,0
+DA:541,0
+DA:543,0
+DA:544,0
+DA:545,0
+DA:546,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:552,0
+DA:555,0
+DA:558,0
+DA:561,0
+DA:562,0
+DA:567,0
+DA:568,0
+DA:569,0
+DA:570,0
+DA:571,0
+DA:573,0
+DA:575,0
+DA:576,0
+DA:580,0
+DA:581,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:589,0
+DA:591,0
+DA:593,0
+DA:594,0
+DA:598,0
+DA:599,0
+DA:601,0
+DA:602,0
+DA:604,0
+DA:607,0
+DA:608,0
+DA:610,0
+DA:612,0
+DA:616,0
+DA:617,0
+DA:618,0
+DA:621,0
+DA:624,0
+DA:628,0
+DA:630,0
+DA:633,0
+DA:639,0
+DA:640,0
+DA:641,0
+DA:642,0
+DA:643,0
+DA:644,0
+DA:649,0
+DA:650,0
+DA:651,0
+DA:654,0
+DA:656,0
+DA:661,0
+DA:663,0
+DA:664,0
+DA:665,0
+DA:666,0
+DA:668,0
+DA:671,0
+DA:677,0
+DA:678,0
+DA:679,0
+DA:680,0
+DA:681,0
+DA:687,0
+DA:688,0
+DA:689,0
+DA:696,0
+DA:697,0
+DA:701,0
+DA:702,0
+DA:703,0
+DA:705,0
+DA:710,0
+DA:713,0
+DA:714,0
+DA:715,0
+DA:716,0
+DA:717,0
+DA:718,0
+DA:724,0
+DA:725,0
+DA:731,0
+DA:732,0
+DA:738,96
+DA:741,96
+DA:743,96
+DA:744,96
+DA:745,96
+DA:746,96
+DA:747,384
+DA:748,288
+DA:749,288
+DA:751,96
+DA:752,96
+DA:753,96
+DA:755,288
+DA:756,192
+DA:757,192
+DA:759,96
+DA:760,96
+DA:762,192
+DA:763,96
+DA:764,96
+DA:766,96
+DA:767,96
+DA:770,96
+DA:771,96
+DA:773,96
+DA:774,96
+DA:780,96
+DA:784,96
+DA:785,96
+DA:786,96
+DA:787,96
+DA:788,96
+DA:789,96
+DA:790,96
+DA:791,96
+DA:792,96
+LF:393
+LH:244
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/blocked/ceed-blocked.c
+FNL:0,18,32
+FNA:0,48,CeedInit_Blocked
+FNL:1,37,37
+FNA:1,192,CeedRegister_Ref_Blocked
+FNF:2
+FNH:2
+DA:18,48
+DA:21,48
+DA:23,48
+DA:26,48
+DA:27,48
+DA:28,48
+DA:30,48
+DA:31,48
+DA:37,192
+LF:9
+LH:9
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-avx.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Avx_Blocked
+FNL:1,13,13
+FNA:1,192,CeedRegister_Avx_Serial
+FNF:2
+FNH:2
+DA:12,384
+DA:13,384
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-cuda.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Cuda
+FNL:1,13,13
+FNA:1,192,CeedRegister_Cuda_Gen
+FNL:2,14,14
+FNA:2,192,CeedRegister_Cuda_Shared
+FNF:3
+FNH:3
+DA:12,384
+DA:13,384
+DA:14,384
+LF:3
+LH:3
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-hip.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Hip
+FNL:1,13,13
+FNA:1,192,CeedRegister_Hip_Gen
+FNL:2,14,14
+FNA:2,192,CeedRegister_Hip_Shared
+FNF:3
+FNH:3
+DA:12,384
+DA:13,384
+DA:14,384
+LF:3
+LH:3
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-magma.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Magma
+FNL:1,13,13
+FNA:1,192,CeedRegister_Magma_Det
+FNF:2
+FNH:2
+DA:12,384
+DA:13,384
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-memcheck.h
+FNL:0,12,12
+FNA:0,0,CeedRegister_Memcheck_Blocked
+FNL:1,13,13
+FNA:1,0,CeedRegister_Memcheck_Serial
+FNF:2
+FNH:0
+DA:12,192
+DA:13,192
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-ref.h
+FNF:0
+FNH:0
+DA:12,192
+DA:13,192
+DA:14,192
+DA:15,192
+LF:4
+LH:4
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-sycl.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Sycl
+FNL:1,13,13
+FNA:1,192,CeedRegister_Sycl_Shared
+FNL:2,14,14
+FNA:2,192,CeedRegister_Sycl_Gen
+FNF:3
+FNH:3
+DA:12,384
+DA:13,384
+DA:14,384
+LF:3
+LH:3
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-xsmm.h
+FNL:0,12,12
+FNA:0,0,CeedRegister_Xsmm_Blocked
+FNL:1,13,13
+FNA:1,0,CeedRegister_Xsmm_Serial
+FNF:2
+FNH:0
+DA:12,192
+DA:13,192
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-blocked.c
+FNL:0,17,34
+FNA:0,24,CeedInit_Memcheck
+FNL:1,39,39
+FNA:1,192,CeedRegister_Memcheck_Blocked
+FNF:2
+FNH:2
+DA:17,24
+DA:20,24
+DA:23,24
+DA:24,24
+DA:25,24
+DA:27,24
+DA:28,24
+DA:29,24
+DA:30,24
+DA:31,24
+DA:32,24
+DA:33,24
+DA:39,192
+LF:13
+LH:13
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-qfunction.c
+FNL:0,111,124
+FNA:0,96,CeedQFunctionCreate_Memcheck
+FNL:1,19,93
+FNA:1,768,CeedQFunctionApply_Memcheck
+FNL:2,98,106
+FNA:2,96,CeedQFunctionDestroy_Memcheck
+FNF:3
+FNH:3
+DA:19,768
+DA:20,768
+DA:23,768
+DA:27,768
+DA:28,768
+DA:29,768
+DA:30,768
+DA:33,2304
+DA:35,1536
+DA:37,1536
+DA:39,1536
+DA:41,1536
+DA:42,1536
+DA:46,1536
+DA:48,768
+DA:50,768
+DA:52,768
+DA:53,768
+DA:55,768
+DA:56,768
+DA:60,768
+DA:63,2304
+DA:64,1536
+DA:65,1536
+DA:72,768
+DA:73,768
+DA:74,768
+DA:75,1536
+DA:80,768
+DA:81,768
+DA:82,50688
+DA:83,49920
+DA:87,768
+DA:88,768
+DA:91,768
+DA:92,768
+DA:98,96
+DA:101,96
+DA:102,96
+DA:103,96
+DA:104,96
+DA:105,96
+DA:111,96
+DA:115,96
+DA:116,96
+DA:117,96
+DA:118,96
+DA:119,96
+DA:120,96
+DA:121,96
+DA:122,96
+DA:123,96
+LF:52
+LH:52
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-qfunctioncontext.c
+FNL:0,116,140
+FNA:0,0,CeedQFunctionContextTakeData_Memcheck
+FNL:1,145,160
+FNA:1,396,CeedQFunctionContextGetData_Memcheck
+FNL:2,165,182
+FNA:2,0,CeedQFunctionContextGetDataRead_Memcheck
+FNL:3,187,203
+FNA:3,396,CeedQFunctionContextRestoreData_Memcheck
+FNL:4,19,25
+FNA:4,396,CeedQFunctionContextHasValidData_Memcheck
+FNL:5,208,225
+FNA:5,0,CeedQFunctionContextRestoreDataRead_Memcheck
+FNL:6,230,262
+FNA:6,144,CeedQFunctionContextDataDestroy_Memcheck
+FNL:7,267,285
+FNA:7,72,CeedQFunctionContextDestroy_Memcheck
+FNL:8,290,309
+FNA:8,72,CeedQFunctionContextCreate_Memcheck
+FNL:9,30,38
+FNA:9,0,CeedQFunctionContextHasBorrowedDataOfType_Memcheck
+FNL:10,43,89
+FNA:10,72,CeedQFunctionContextSetData_Memcheck
+FNL:11,94,111
+FNA:11,396,CeedQFunctionContextSyncData_Memcheck
+FNF:12
+FNH:8
+DA:19,396
+DA:22,396
+DA:23,396
+DA:24,396
+DA:30,0
+DA:33,0
+DA:35,0
+DA:36,0
+DA:37,0
+DA:43,72
+DA:47,72
+DA:49,72
+DA:50,72
+DA:53,72
+DA:54,0
+DA:55,0
+DA:57,72
+DA:58,72
+DA:59,0
+DA:60,0
+DA:62,72
+DA:65,72
+DA:68,72
+DA:69,0
+DA:70,0
+DA:71,0
+DA:72,0
+DA:73,24
+DA:74,24
+DA:75,24
+DA:76,24
+DA:77,24
+DA:78,48
+DA:79,48
+DA:80,48
+DA:81,48
+DA:85,72
+DA:86,72
+DA:87,72
+DA:88,72
+DA:94,396
+DA:98,396
+DA:100,396
+DA:101,396
+DA:104,396
+DA:105,204
+DA:107,396
+DA:108,192
+DA:110,396
+DA:116,0
+DA:120,0
+DA:122,0
+DA:123,0
+DA:126,0
+DA:129,0
+DA:130,0
+DA:131,0
+DA:134,0
+DA:135,0
+DA:136,0
+DA:138,0
+DA:139,0
+DA:145,396
+DA:149,396
+DA:151,396
+DA:152,396
+DA:155,396
+DA:156,396
+DA:157,396
+DA:158,396
+DA:159,396
+DA:165,0
+DA:169,0
+DA:171,0
+DA:172,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:178,0
+DA:180,0
+DA:181,0
+DA:187,396
+DA:191,396
+DA:192,396
+DA:195,396
+DA:196,396
+DA:199,396
+DA:200,396
+DA:201,396
+DA:202,396
+DA:208,0
+DA:212,0
+DA:213,0
+DA:216,0
+DA:218,0
+DA:221,0
+DA:222,0
+DA:223,0
+DA:224,0
+DA:230,144
+DA:235,144
+DA:237,144
+DA:238,144
+DA:242,144
+DA:243,0
+DA:245,0
+DA:246,0
+DA:247,0
+DA:250,144
+DA:251,72
+DA:252,72
+DA:254,144
+DA:255,24
+DA:256,24
+DA:258,144
+DA:259,48
+DA:261,144
+DA:267,72
+DA:271,72
+DA:272,72
+DA:273,0
+DA:274,0
+DA:276,72
+DA:277,0
+DA:278,0
+DA:280,72
+DA:281,48
+DA:283,72
+DA:284,72
+DA:290,72
+DA:294,72
+DA:295,72
+DA:296,72
+DA:297,72
+DA:298,72
+DA:299,72
+DA:300,72
+DA:301,72
+DA:302,72
+DA:303,72
+DA:304,72
+DA:305,72
+DA:306,72
+DA:307,72
+DA:308,72
+LF:145
+LH:92
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-restriction.c
+FNL:0,109,149
+FNA:0,0,CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core
+FNL:1,151,190
+FNA:1,0,CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core
+FNL:2,19,41
+FNA:2,192,CeedElemRestrictionGetBackendStrides_Memcheck
+FNL:3,192,216
+FNA:3,48,CeedElemRestrictionApplyStridedTranspose_Memcheck_Core
+FNL:4,218,240
+FNA:4,48,CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core
+FNL:5,242,264
+FNA:5,0,CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core
+FNL:6,266,315
+FNA:6,0,CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core
+FNL:7,317,365
+FNA:7,0,CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core
+FNL:8,367,390
+FNA:8,0,CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core
+FNL:9,392,500
+FNA:9,240,CeedElemRestrictionApply_Memcheck_Core
+FNL:10,46,70
+FNA:10,48,CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core
+FNL:11,505,516
+FNA:11,240,CeedElemRestrictionApply_Memcheck
+FNL:12,521,533
+FNA:12,0,CeedElemRestrictionApplyUnsigned_Memcheck
+FNL:13,538,550
+FNA:13,0,CeedElemRestrictionApplyUnoriented_Memcheck
+FNL:14,555,563
+FNA:14,0,CeedElemRestrictionApplyAtPointsInElement_Memcheck
+FNL:15,568,579
+FNA:15,0,CeedElemRestrictionApplyBlock_Memcheck
+FNL:16,584,593
+FNA:16,72,CeedElemRestrictionGetOffsets_Memcheck
+FNL:17,598,607
+FNA:17,0,CeedElemRestrictionGetOrientations_Memcheck
+FNL:18,612,621
+FNA:18,0,CeedElemRestrictionGetCurlOrientations_Memcheck
+FNL:19,626,635
+FNA:19,192,CeedElemRestrictionDestroy_Memcheck
+FNL:20,640,773
+FNA:20,264,CeedElemRestrictionCreate_Memcheck
+FNL:21,72,88
+FNA:21,96,CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core
+FNL:22,90,107
+FNA:22,0,CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core
+FNF:23
+FNH:10
+DA:19,192
+DA:22,192
+DA:23,192
+DA:24,192
+DA:26,192
+DA:27,192
+DA:28,192
+DA:40,192
+DA:46,48
+DA:52,48
+DA:54,48
+DA:55,48
+DA:56,0
+DA:59,432
+DA:60,768
+DA:61,9888
+DA:62,34464
+DA:63,24960
+DA:64,24960
+DA:69,48
+DA:72,96
+DA:79,96
+DA:80,864
+DA:81,1656
+DA:82,55608
+DA:83,54720
+DA:87,96
+DA:90,0
+DA:97,0
+DA:98,0
+DA:99,0
+DA:100,0
+DA:101,0
+DA:102,0
+DA:106,0
+DA:109,0
+DA:116,0
+DA:117,0
+DA:118,0
+DA:119,0
+DA:121,0
+DA:122,0
+DA:123,0
+DA:124,0
+DA:125,0
+DA:126,0
+DA:128,0
+DA:129,0
+DA:130,0
+DA:131,0
+DA:132,0
+DA:133,0
+DA:134,0
+DA:135,0
+DA:136,0
+DA:139,0
+DA:140,0
+DA:141,0
+DA:142,0
+DA:143,0
+DA:144,0
+DA:148,0
+DA:151,0
+DA:157,0
+DA:158,0
+DA:159,0
+DA:160,0
+DA:162,0
+DA:163,0
+DA:164,0
+DA:165,0
+DA:166,0
+DA:167,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:180,0
+DA:181,0
+DA:182,0
+DA:183,0
+DA:184,0
+DA:185,0
+DA:189,0
+DA:192,48
+DA:198,48
+DA:200,48
+DA:201,48
+DA:202,0
+DA:205,432
+DA:206,768
+DA:207,9888
+DA:208,24096
+DA:209,14592
+DA:210,14592
+DA:215,48
+DA:218,48
+DA:225,48
+DA:226,432
+DA:227,768
+DA:228,6624
+DA:230,16000
+DA:233,9760
+DA:234,9760
+DA:239,48
+DA:242,0
+DA:249,0
+DA:250,0
+DA:251,0
+DA:252,0
+DA:254,0
+DA:257,0
+DA:258,0
+DA:263,0
+DA:266,0
+DA:269,0
+DA:272,0
+DA:274,0
+DA:275,0
+DA:276,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:282,0
+DA:283,0
+DA:284,0
+DA:285,0
+DA:287,0
+DA:288,0
+DA:290,0
+DA:291,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:296,0
+DA:297,0
+DA:299,0
+DA:300,0
+DA:303,0
+DA:304,0
+DA:305,0
+DA:306,0
+DA:307,0
+DA:309,0
+DA:310,0
+DA:314,0
+DA:317,0
+DA:319,0
+DA:322,0
+DA:324,0
+DA:325,0
+DA:326,0
+DA:328,0
+DA:329,0
+DA:331,0
+DA:332,0
+DA:333,0
+DA:334,0
+DA:335,0
+DA:337,0
+DA:338,0
+DA:340,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:344,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:349,0
+DA:350,0
+DA:353,0
+DA:354,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:359,0
+DA:360,0
+DA:364,0
+DA:367,0
+DA:371,0
+DA:374,0
+DA:375,0
+DA:376,0
+DA:377,0
+DA:378,0
+DA:379,0
+DA:380,0
+DA:383,0
+DA:384,0
+DA:387,0
+DA:389,0
+DA:392,240
+DA:401,240
+DA:402,240
+DA:403,240
+DA:404,240
+DA:405,240
+DA:407,240
+DA:409,96
+DA:412,144
+DA:415,240
+DA:421,96
+DA:422,48
+DA:423,48
+DA:425,48
+DA:426,48
+DA:427,48
+DA:429,48
+DA:430,0
+DA:431,0
+DA:432,0
+DA:435,0
+DA:438,0
+DA:439,0
+DA:440,0
+DA:441,0
+DA:443,0
+DA:444,0
+DA:447,0
+DA:450,0
+DA:451,0
+DA:452,0
+DA:453,0
+DA:461,144
+DA:462,48
+DA:463,48
+DA:465,48
+DA:466,96
+DA:467,96
+DA:469,96
+DA:470,0
+DA:471,0
+DA:472,0
+DA:475,0
+DA:478,0
+DA:479,0
+DA:480,0
+DA:481,0
+DA:483,0
+DA:484,0
+DA:487,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:493,0
+DA:496,240
+DA:497,240
+DA:498,240
+DA:499,240
+DA:505,240
+DA:509,240
+DA:510,240
+DA:511,240
+DA:512,240
+DA:513,240
+DA:514,240
+DA:515,240
+DA:521,0
+DA:526,0
+DA:527,0
+DA:528,0
+DA:529,0
+DA:530,0
+DA:531,0
+DA:532,0
+DA:538,0
+DA:543,0
+DA:544,0
+DA:545,0
+DA:546,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:555,0
+DA:560,0
+DA:561,0
+DA:562,0
+DA:568,0
+DA:573,0
+DA:574,0
+DA:575,0
+DA:576,0
+DA:577,0
+DA:578,0
+DA:584,72
+DA:587,72
+DA:589,72
+DA:591,72
+DA:592,72
+DA:598,0
+DA:601,0
+DA:603,0
+DA:605,0
+DA:606,0
+DA:612,0
+DA:615,0
+DA:617,0
+DA:619,0
+DA:620,0
+DA:626,192
+DA:629,192
+DA:630,192
+DA:631,192
+DA:632,192
+DA:633,192
+DA:634,192
+DA:640,264
+DA:643,264
+DA:647,264
+DA:648,264
+DA:649,264
+DA:650,264
+DA:651,264
+DA:652,264
+DA:653,264
+DA:654,264
+DA:656,264
+DA:658,264
+DA:659,264
+DA:664,264
+DA:666,264
+DA:667,264
+DA:668,96
+DA:669,96
+DA:670,96
+DA:671,96
+DA:677,264
+DA:678,0
+DA:680,0
+DA:681,0
+DA:683,0
+DA:684,0
+DA:687,0
+DA:688,0
+DA:692,264
+DA:697,168
+DA:698,34328
+DA:699,34160
+DA:705,168
+DA:706,168
+DA:707,168
+DA:708,96
+DA:709,96
+DA:710,96
+DA:711,96
+DA:712,96
+DA:713,72
+DA:714,72
+DA:715,72
+DA:716,72
+DA:717,0
+DA:718,0
+DA:722,168
+DA:723,0
+DA:724,0
+DA:725,0
+DA:726,0
+DA:727,0
+DA:728,0
+DA:729,0
+DA:730,0
+DA:731,0
+DA:732,0
+DA:733,0
+DA:734,0
+DA:735,0
+DA:737,168
+DA:738,0
+DA:739,0
+DA:740,0
+DA:741,0
+DA:742,0
+DA:743,0
+DA:744,0
+DA:745,0
+DA:746,0
+DA:747,0
+DA:748,0
+DA:749,0
+DA:750,0
+DA:756,264
+DA:759,264
+DA:760,264
+DA:761,264
+DA:762,264
+DA:763,0
+DA:766,264
+DA:767,264
+DA:768,264
+DA:769,264
+DA:770,264
+DA:771,264
+DA:772,264
+LF:400
+LH:145
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-serial.c
+FNL:0,17,35
+FNA:0,24,CeedInit_Memcheck
+FNL:1,40,40
+FNA:1,192,CeedRegister_Memcheck_Serial
+FNF:2
+FNH:2
+DA:17,24
+DA:20,24
+DA:24,24
+DA:25,24
+DA:26,24
+DA:28,24
+DA:29,24
+DA:30,24
+DA:31,24
+DA:32,24
+DA:33,24
+DA:34,24
+DA:40,192
+LF:13
+LH:13
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-vector.c
+FNL:0,102,113
+FNA:0,144,CeedVectorSetValue_Memcheck
+FNL:1,118,130
+FNA:1,0,CeedVectorSetValueStrided_Memcheck
+FNL:2,135,152
+FNA:2,2352,CeedVectorSyncArray_Memcheck
+FNL:3,157,181
+FNA:3,0,CeedVectorTakeArray_Memcheck
+FNL:4,186,201
+FNA:4,2352,CeedVectorGetArray_Memcheck
+FNL:5,206,223
+FNA:5,3120,CeedVectorGetArrayRead_Memcheck
+FNL:6,21,27
+FNA:6,3240,CeedVectorHasValidArray_Memcheck
+FNL:7,228,247
+FNA:7,2232,CeedVectorGetArrayWrite_Memcheck
+FNL:8,252,279
+FNA:8,2352,CeedVectorRestoreArray_Memcheck
+FNL:9,284,301
+FNA:9,3120,CeedVectorRestoreArrayRead_Memcheck
+FNL:10,306,317
+FNA:10,0,CeedVectorReciprocal_Memcheck
+FNL:11,32,40
+FNA:11,0,CeedVectorHasBorrowedArrayOfType_Memcheck
+FNL:12,322,331
+FNA:12,0,CeedVectorScale_Memcheck
+FNL:13,336,346
+FNA:13,0,CeedVectorAXPY_Memcheck
+FNL:14,351,361
+FNA:14,0,CeedVectorAXPBY_Memcheck
+FNL:15,366,379
+FNA:15,0,CeedVectorPointwiseMult_Memcheck
+FNL:16,384,402
+FNA:16,864,CeedVectorDestroy_Memcheck
+FNL:17,407,434
+FNA:17,864,CeedVectorCreate_Memcheck
+FNL:18,45,97
+FNA:18,2544,CeedVectorSetArray_Memcheck
+FNF:19
+FNH:11
+DA:21,3240
+DA:24,3240
+DA:25,3240
+DA:26,3240
+DA:32,0
+DA:35,0
+DA:37,0
+DA:38,0
+DA:39,0
+DA:45,2544
+DA:49,2544
+DA:51,2544
+DA:52,2544
+DA:55,2544
+DA:56,31648
+DA:57,1680
+DA:59,2544
+DA:60,2544
+DA:61,1968
+DA:62,0
+DA:63,0
+DA:65,1968
+DA:69,2544
+DA:72,2544
+DA:73,576
+DA:75,576
+DA:76,0
+DA:77,0
+DA:78,0
+DA:79,0
+DA:80,0
+DA:81,1968
+DA:82,1968
+DA:83,1968
+DA:84,1968
+DA:85,1968
+DA:89,2544
+DA:90,2544
+DA:91,2544
+DA:92,1992
+DA:94,359032
+DA:96,2544
+DA:102,144
+DA:106,144
+DA:107,144
+DA:109,144
+DA:110,144
+DA:111,30960
+DA:112,144
+DA:118,0
+DA:122,0
+DA:123,0
+DA:125,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:135,2352
+DA:139,2352
+DA:141,2352
+DA:142,2352
+DA:145,2352
+DA:146,0
+DA:148,2352
+DA:149,792
+DA:151,2352
+DA:157,0
+DA:161,0
+DA:163,0
+DA:164,0
+DA:167,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:179,0
+DA:180,0
+DA:186,2352
+DA:190,2352
+DA:192,2352
+DA:193,2352
+DA:196,2352
+DA:197,2352
+DA:198,2352
+DA:199,2352
+DA:200,2352
+DA:206,3120
+DA:210,3120
+DA:212,3120
+DA:213,3120
+DA:216,3120
+DA:217,3120
+DA:218,3120
+DA:219,3120
+DA:221,3120
+DA:222,3120
+DA:228,2232
+DA:232,2232
+DA:234,2232
+DA:235,2232
+DA:238,2232
+DA:241,2232
+DA:244,417752
+DA:245,2232
+DA:246,2232
+DA:252,2352
+DA:256,2352
+DA:257,2352
+DA:260,2352
+DA:261,417752
+DA:262,415520
+DA:263,0
+DA:267,2232
+DA:271,2352
+DA:272,2352
+DA:275,449456
+DA:276,2352
+DA:277,2352
+DA:278,2352
+DA:284,3120
+DA:288,3120
+DA:289,3120
+DA:292,3120
+DA:294,3120
+DA:297,503680
+DA:298,3120
+DA:299,3120
+DA:300,3120
+DA:306,0
+DA:310,0
+DA:311,0
+DA:313,0
+DA:314,0
+DA:316,0
+DA:322,0
+DA:326,0
+DA:327,0
+DA:329,0
+DA:330,0
+DA:336,0
+DA:340,0
+DA:341,0
+DA:342,0
+DA:344,0
+DA:345,0
+DA:351,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:359,0
+DA:360,0
+DA:366,0
+DA:370,0
+DA:371,0
+DA:372,0
+DA:373,0
+DA:375,0
+DA:376,0
+DA:377,0
+DA:378,0
+DA:384,864
+DA:388,864
+DA:389,864
+DA:390,864
+DA:391,864
+DA:393,864
+DA:394,0
+DA:395,0
+DA:397,864
+DA:398,288
+DA:400,864
+DA:401,864
+DA:407,864
+DA:411,864
+DA:412,864
+DA:413,864
+DA:414,864
+DA:415,864
+DA:416,864
+DA:417,864
+DA:418,864
+DA:419,864
+DA:420,864
+DA:421,864
+DA:422,864
+DA:423,864
+DA:424,864
+DA:425,864
+DA:426,864
+DA:427,864
+DA:428,864
+DA:429,864
+DA:430,864
+DA:431,864
+DA:432,864
+DA:433,864
+LF:198
+LH:129
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-blocked.c
+FNL:0,18,24
+FNA:0,48,CeedDestroy_Opt
+FNL:1,29,51
+FNA:1,48,CeedInit_Opt_Blocked
+FNL:2,56,56
+FNA:2,192,CeedRegister_Opt_Blocked
+FNF:3
+FNH:3
+DA:18,48
+DA:21,48
+DA:22,48
+DA:23,48
+DA:29,48
+DA:33,48
+DA:35,48
+DA:38,48
+DA:39,48
+DA:40,48
+DA:42,48
+DA:43,48
+DA:44,48
+DA:47,48
+DA:48,48
+DA:49,48
+DA:50,48
+DA:56,192
+LF:18
+LH:18
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-operator.c
+FNL:0,19,200
+FNA:0,384,CeedOperatorSetupFields_Opt
+FNL:1,205,272
+FNA:1,192,CeedOperatorSetup_Opt
+FNL:2,277,312
+FNA:2,192,CeedOperatorSetupInputs_Opt
+FNL:3,317,368
+FNA:3,1536,CeedOperatorInputBasis_Opt
+FNL:4,373,418
+FNA:4,1536,CeedOperatorOutputBasis_Opt
+FNL:5,423,437
+FNA:5,192,CeedOperatorRestoreInputs_Opt
+FNL:6,442,513
+FNA:6,192,CeedOperatorApplyAdd_Opt
+FNL:7,518,726
+FNA:7,0,CeedOperatorLinearAssembleQFunctionCore_Opt
+FNL:8,731,733
+FNA:8,0,CeedOperatorLinearAssembleQFunction_Opt
+FNL:9,738,740
+FNA:9,0,CeedOperatorLinearAssembleQFunctionUpdate_Opt
+FNL:10,745,780
+FNA:10,192,CeedOperatorDestroy_Opt
+FNL:11,785,805
+FNA:11,192,CeedOperatorCreate_Opt
+FNF:12
+FNH:9
+DA:19,384
+DA:31,384
+DA:32,384
+DA:33,384
+DA:34,384
+DA:36,384
+DA:37,192
+DA:38,192
+DA:40,192
+DA:41,192
+DA:45,960
+DA:49,576
+DA:50,576
+DA:57,480
+DA:58,480
+DA:59,480
+DA:60,480
+DA:61,480
+DA:62,480
+DA:63,480
+DA:65,480
+DA:66,480
+DA:67,288
+DA:68,288
+DA:70,288
+DA:71,288
+DA:73,288
+DA:74,288
+DA:75,0
+DA:76,0
+DA:77,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:83,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:90,0
+DA:91,0
+DA:92,0
+DA:95,0
+DA:96,0
+DA:97,0
+DA:98,192
+DA:101,192
+DA:102,192
+DA:104,192
+DA:105,0
+DA:107,0
+DA:109,480
+DA:110,480
+DA:111,480
+DA:114,576
+DA:115,192
+DA:116,192
+DA:117,192
+DA:118,192
+DA:119,192
+DA:120,192
+DA:121,192
+DA:122,288
+DA:126,288
+DA:127,288
+DA:128,288
+DA:129,288
+DA:130,288
+DA:131,288
+DA:132,288
+DA:133,288
+DA:134,288
+DA:135,288
+DA:136,96
+DA:137,96
+DA:138,96
+DA:139,96
+DA:140,96
+DA:141,96
+DA:142,96
+DA:145,576
+DA:148,384
+DA:149,576
+DA:153,384
+DA:154,384
+DA:155,576
+DA:159,192
+DA:160,192
+DA:161,192
+DA:162,0
+DA:163,0
+DA:164,0
+DA:166,192
+DA:167,192
+DA:169,384
+DA:170,384
+DA:173,384
+DA:177,192
+DA:178,192
+DA:179,192
+DA:183,0
+DA:184,0
+DA:185,0
+DA:186,0
+DA:187,0
+DA:188,0
+DA:189,0
+DA:191,0
+DA:192,0
+DA:194,192
+DA:195,192
+DA:198,384
+DA:199,384
+DA:205,192
+DA:215,192
+DA:216,192
+DA:218,192
+DA:219,192
+DA:220,192
+DA:221,192
+DA:222,192
+DA:223,192
+DA:224,192
+DA:225,192
+DA:226,192
+DA:227,192
+DA:230,192
+DA:231,192
+DA:233,192
+DA:234,192
+DA:235,192
+DA:236,192
+DA:237,192
+DA:238,192
+DA:239,192
+DA:240,192
+DA:242,192
+DA:243,192
+DA:247,192
+DA:250,192
+DA:254,192
+DA:258,0
+DA:259,0
+DA:260,0
+DA:262,0
+DA:263,0
+DA:265,0
+DA:269,192
+DA:270,192
+DA:271,192
+DA:277,192
+DA:280,576
+DA:283,384
+DA:284,384
+DA:290,288
+DA:291,288
+DA:293,96
+DA:294,96
+DA:295,96
+DA:297,96
+DA:299,96
+DA:302,192
+DA:303,0
+DA:304,0
+DA:305,0
+DA:308,288
+DA:311,192
+DA:317,1536
+DA:320,4608
+DA:329,3072
+DA:330,3072
+DA:331,3072
+DA:332,3072
+DA:335,3072
+DA:336,3072
+DA:337,3072
+DA:338,3072
+DA:339,3072
+DA:341,3072
+DA:342,1536
+DA:345,3072
+DA:346,768
+DA:347,768
+DA:348,768
+DA:350,768
+DA:351,1536
+DA:355,1536
+DA:356,1536
+DA:357,0
+DA:358,0
+DA:360,1536
+DA:361,1536
+DA:362,1536
+DA:363,768
+DA:364,768
+DA:367,1536
+DA:373,1536
+DA:376,3072
+DA:383,1536
+DA:385,1536
+DA:386,768
+DA:387,768
+DA:388,768
+DA:392,768
+DA:393,768
+DA:394,0
+DA:396,768
+DA:398,768
+DA:399,768
+DA:407,1536
+DA:409,1536
+DA:410,1536
+DA:411,1536
+DA:413,1536
+DA:415,1536
+DA:417,1536
+DA:423,192
+DA:425,576
+DA:429,384
+DA:430,384
+DA:431,384
+DA:432,96
+DA:434,384
+DA:436,192
+DA:442,192
+DA:447,192
+DA:454,192
+DA:456,192
+DA:457,192
+DA:458,192
+DA:459,192
+DA:460,192
+DA:461,192
+DA:462,192
+DA:465,192
+DA:466,0
+DA:467,0
+DA:468,0
+DA:470,0
+DA:473,192
+DA:474,192
+DA:475,192
+DA:476,192
+DA:479,192
+DA:482,384
+DA:484,192
+DA:485,192
+DA:487,96
+DA:488,96
+DA:489,96
+DA:494,1728
+DA:496,1536
+DA:500,1536
+DA:501,1536
+DA:505,1536
+DA:510,192
+DA:511,192
+DA:512,192
+DA:518,0
+DA:523,0
+DA:529,0
+DA:530,0
+DA:531,0
+DA:532,0
+DA:533,0
+DA:535,0
+DA:536,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:540,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:546,0
+DA:549,0
+DA:552,0
+DA:555,0
+DA:556,0
+DA:561,0
+DA:562,0
+DA:563,0
+DA:564,0
+DA:565,0
+DA:567,0
+DA:569,0
+DA:570,0
+DA:574,0
+DA:575,0
+DA:580,0
+DA:581,0
+DA:582,0
+DA:583,0
+DA:585,0
+DA:587,0
+DA:588,0
+DA:592,0
+DA:593,0
+DA:595,0
+DA:596,0
+DA:597,0
+DA:601,0
+DA:602,0
+DA:604,0
+DA:606,0
+DA:610,0
+DA:611,0
+DA:612,0
+DA:615,0
+DA:618,0
+DA:622,0
+DA:623,0
+DA:624,0
+DA:627,0
+DA:631,0
+DA:637,0
+DA:638,0
+DA:639,0
+DA:640,0
+DA:641,0
+DA:642,0
+DA:647,0
+DA:648,0
+DA:649,0
+DA:652,0
+DA:654,0
+DA:658,0
+DA:659,0
+DA:662,0
+DA:663,0
+DA:664,0
+DA:666,0
+DA:669,0
+DA:675,0
+DA:676,0
+DA:677,0
+DA:678,0
+DA:679,0
+DA:685,0
+DA:686,0
+DA:687,0
+DA:693,0
+DA:694,0
+DA:698,0
+DA:699,0
+DA:700,0
+DA:702,0
+DA:707,0
+DA:708,0
+DA:712,0
+DA:716,0
+DA:717,0
+DA:718,0
+DA:722,0
+DA:723,0
+DA:724,0
+DA:725,0
+DA:731,0
+DA:732,0
+DA:738,0
+DA:739,0
+DA:745,192
+DA:748,192
+DA:749,768
+DA:750,576
+DA:751,576
+DA:753,192
+DA:754,192
+DA:755,192
+DA:756,192
+DA:757,192
+DA:758,192
+DA:760,576
+DA:761,384
+DA:762,384
+DA:764,192
+DA:765,192
+DA:767,384
+DA:768,192
+DA:769,192
+DA:771,192
+DA:772,192
+DA:775,192
+DA:776,192
+DA:778,192
+DA:779,192
+DA:785,192
+DA:790,192
+DA:791,192
+DA:792,192
+DA:794,192
+DA:795,192
+DA:797,192
+DA:799,192
+DA:800,192
+DA:801,192
+DA:802,192
+DA:803,192
+DA:804,192
+LF:400
+LH:249
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-serial.c
+FNL:0,18,24
+FNA:0,48,CeedDestroy_Opt
+FNL:1,29,51
+FNA:1,48,CeedInit_Opt_Serial
+FNL:2,56,56
+FNA:2,192,CeedRegister_Opt_Serial
+FNF:3
+FNH:3
+DA:18,48
+DA:21,48
+DA:22,48
+DA:23,48
+DA:29,48
+DA:33,48
+DA:35,48
+DA:38,48
+DA:39,48
+DA:40,48
+DA:42,48
+DA:43,48
+DA:44,48
+DA:47,48
+DA:48,48
+DA:49,48
+DA:50,48
+DA:56,192
+LF:18
+LH:18
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-tensor.c
+FNL:0,16,35
+FNA:0,2016,CeedTensorContractApply_Core_Opt
+FNL:1,40,49
+FNA:1,2016,CeedTensorContractApply_Opt
+FNL:2,54,57
+FNA:2,96,CeedTensorContractCreate_Opt
+FNF:3
+FNH:3
+DA:16,2016
+DA:19,2016
+DA:21,2016
+DA:22,504
+DA:23,504
+DA:26,13080
+DA:27,72600
+DA:28,419808
+DA:29,358272
+DA:30,2727456
+DA:34,2016
+DA:40,2016
+DA:42,2016
+DA:43,415104
+DA:46,2016
+DA:47,672
+DA:54,96
+DA:55,96
+DA:56,96
+LF:19
+LH:19
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-basis.c
+FNL:0,19,251
+FNA:0,4800,CeedBasisApplyCore_Ref
+FNL:1,253,256
+FNA:1,4800,CeedBasisApply_Ref
+FNL:2,258,261
+FNA:2,0,CeedBasisApplyAdd_Ref
+FNL:3,266,273
+FNA:3,384,CeedBasisDestroyTensor_Ref
+FNL:4,278,306
+FNA:4,384,CeedBasisCreateTensorH1_Ref
+FNL:5,311,328
+FNA:5,0,CeedBasisCreateH1_Ref
+FNL:6,333,350
+FNA:6,0,CeedBasisCreateHdiv_Ref
+FNL:7,355,372
+FNA:7,0,CeedBasisCreateHcurl_Ref
+FNF:8
+FNH:4
+DA:19,4800
+DA:21,4800
+DA:28,4800
+DA:29,4800
+DA:30,4800
+DA:31,4800
+DA:32,4800
+DA:33,4800
+DA:34,4800
+DA:35,4800
+DA:36,192
+DA:38,4800
+DA:39,4800
+DA:41,4800
+DA:44,1536
+DA:45,64576
+DA:48,4800
+DA:49,4800
+DA:53,4800
+DA:54,4800
+DA:55,4800
+DA:57,3072
+DA:58,3072
+DA:59,0
+DA:60,3072
+DA:61,3072
+DA:63,3072
+DA:64,1536
+DA:65,1536
+DA:67,3072
+DA:68,3072
+DA:71,3072
+DA:72,7104
+DA:73,4032
+DA:75,4032
+DA:76,4032
+DA:79,3072
+DA:81,1536
+DA:86,1536
+DA:88,1536
+DA:89,0
+DA:90,0
+DA:92,1536
+DA:95,1536
+DA:96,1536
+DA:97,1536
+DA:98,1536
+DA:102,3552
+DA:103,2016
+DA:107,2016
+DA:108,2016
+DA:112,1536
+DA:113,1536
+DA:114,0
+DA:115,0
+DA:117,1536
+DA:118,3552
+DA:119,2016
+DA:124,2016
+DA:125,2016
+DA:127,0
+DA:130,0
+DA:133,0
+DA:135,0
+DA:136,0
+DA:139,0
+DA:140,0
+DA:142,0
+DA:145,0
+DA:147,0
+DA:148,0
+DA:149,0
+DA:151,0
+DA:154,0
+DA:155,0
+DA:157,0
+DA:158,0
+DA:162,0
+DA:163,0
+DA:167,1536
+DA:169,192
+DA:170,192
+DA:173,192
+DA:174,192
+DA:175,576
+DA:176,384
+DA:178,3648
+DA:179,22848
+DA:180,66048
+DA:181,46464
+DA:183,255552
+DA:188,192
+DA:199,0
+DA:201,0
+DA:203,0
+DA:206,0
+DA:207,0
+DA:208,0
+DA:210,0
+DA:213,0
+DA:214,0
+DA:215,0
+DA:217,0
+DA:220,0
+DA:221,0
+DA:222,0
+DA:224,0
+DA:227,0
+DA:228,0
+DA:229,0
+DA:231,0
+DA:234,0
+DA:235,0
+DA:236,0
+DA:237,0
+DA:239,0
+DA:246,4800
+DA:247,4608
+DA:249,4800
+DA:250,4800
+DA:253,4800
+DA:254,4800
+DA:255,4800
+DA:258,0
+DA:259,0
+DA:260,0
+DA:266,384
+DA:269,384
+DA:270,384
+DA:271,384
+DA:272,384
+DA:278,384
+DA:284,384
+DA:285,384
+DA:287,384
+DA:289,384
+DA:290,384
+DA:291,384
+DA:292,384
+DA:294,384
+DA:296,384
+DA:297,384
+DA:298,384
+DA:300,384
+DA:301,384
+DA:302,384
+DA:303,384
+DA:304,384
+DA:305,384
+DA:311,0
+DA:316,0
+DA:317,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:323,0
+DA:324,0
+DA:325,0
+DA:326,0
+DA:327,0
+DA:333,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:355,0
+DA:360,0
+DA:361,0
+DA:363,0
+DA:364,0
+DA:365,0
+DA:367,0
+DA:368,0
+DA:369,0
+DA:370,0
+DA:371,0
+LF:182
+LH:98
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-operator.c
+FNL:0,1025,1084
+FNA:0,0,CeedOperatorApplyAddAtPoints_Ref
+FNL:1,1089,1310
+FNA:1,0,CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref
+FNL:2,1315,1317
+FNA:2,0,CeedOperatorLinearAssembleQFunctionAtPoints_Ref
+FNL:3,1322,1325
+FNA:3,0,CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref
+FNL:4,1330,1528
+FNA:4,0,CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref
+FNL:5,145,206
+FNA:5,96,CeedOperatorSetup_Ref
+FNL:6,1533,1735
+FNA:6,0,CeedOperatorAssembleSingleAtPoints_Ref
+FNL:7,1740,1771
+FNA:7,96,CeedOperatorDestroy_Ref
+FNL:8,1776,1789
+FNA:8,96,CeedOperatorCreate_Ref
+FNL:9,1794,1810
+FNA:9,0,CeedOperatorCreateAtPoints_Ref
+FNL:10,19,140
+FNA:10,192,CeedOperatorSetupFields_Ref
+FNL:11,211,249
+FNA:11,96,CeedOperatorSetupInputs_Ref
+FNL:12,254,299
+FNA:12,1344,CeedOperatorInputBasis_Ref
+FNL:13,304,345
+FNA:13,1344,CeedOperatorOutputBasis_Ref
+FNL:14,350,373
+FNA:14,96,CeedOperatorRestoreInputs_Ref
+FNL:15,378,473
+FNA:15,96,CeedOperatorApplyAdd_Ref
+FNL:16,478,654
+FNA:16,0,CeedOperatorLinearAssembleQFunctionCore_Ref
+FNL:17,659,661
+FNA:17,0,CeedOperatorLinearAssembleQFunction_Ref
+FNL:18,666,668
+FNA:18,0,CeedOperatorLinearAssembleQFunctionUpdate_Ref
+FNL:19,673,827
+FNA:19,0,CeedOperatorSetupFieldsAtPoints_Ref
+FNL:20,832,882
+FNA:20,0,CeedOperatorSetupAtPoints_Ref
+FNL:21,887,949
+FNA:21,0,CeedOperatorInputBasisAtPoints_Ref
+FNL:22,954,1020
+FNA:22,0,CeedOperatorOutputBasisAtPoints_Ref
+FNF:23
+FNH:9
+DA:19,192
+DA:31,192
+DA:32,192
+DA:33,192
+DA:34,192
+DA:36,192
+DA:37,96
+DA:38,96
+DA:40,96
+DA:41,96
+DA:45,480
+DA:50,288
+DA:51,288
+DA:52,240
+DA:53,240
+DA:54,240
+DA:57,288
+DA:58,96
+DA:59,96
+DA:60,96
+DA:61,96
+DA:62,96
+DA:63,144
+DA:67,144
+DA:68,144
+DA:69,144
+DA:70,144
+DA:71,144
+DA:72,144
+DA:73,144
+DA:74,144
+DA:75,144
+DA:76,144
+DA:77,48
+DA:78,48
+DA:79,48
+DA:80,48
+DA:81,48
+DA:82,48
+DA:83,48
+DA:87,192
+DA:88,288
+DA:92,192
+DA:93,192
+DA:94,288
+DA:98,96
+DA:99,96
+DA:100,96
+DA:101,0
+DA:102,0
+DA:103,0
+DA:105,96
+DA:106,96
+DA:108,192
+DA:109,192
+DA:112,192
+DA:116,96
+DA:117,96
+DA:118,96
+DA:122,0
+DA:123,0
+DA:124,0
+DA:125,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:131,0
+DA:132,0
+DA:134,96
+DA:135,96
+DA:138,192
+DA:139,192
+DA:145,96
+DA:153,96
+DA:154,96
+DA:156,96
+DA:157,96
+DA:158,96
+DA:159,96
+DA:160,96
+DA:161,96
+DA:164,96
+DA:166,96
+DA:167,96
+DA:168,96
+DA:169,96
+DA:170,96
+DA:171,96
+DA:172,96
+DA:173,96
+DA:174,96
+DA:176,96
+DA:177,96
+DA:181,96
+DA:184,96
+DA:188,96
+DA:192,0
+DA:193,0
+DA:194,0
+DA:196,0
+DA:197,0
+DA:199,0
+DA:203,96
+DA:204,96
+DA:205,96
+DA:211,96
+DA:214,288
+DA:221,192
+DA:222,192
+DA:223,192
+DA:224,96
+DA:225,96
+DA:228,192
+DA:230,192
+DA:233,144
+DA:235,144
+DA:238,144
+DA:239,144
+DA:240,144
+DA:242,144
+DA:244,144
+DA:246,192
+DA:248,96
+DA:254,1344
+DA:257,4032
+DA:264,2688
+DA:268,0
+DA:269,0
+DA:270,0
+DA:271,0
+DA:274,2688
+DA:275,2688
+DA:276,2688
+DA:277,2688
+DA:278,2688
+DA:280,2688
+DA:281,672
+DA:282,672
+DA:283,672
+DA:284,1344
+DA:288,1344
+DA:289,1344
+DA:290,1344
+DA:291,1344
+DA:292,1344
+DA:293,1344
+DA:294,672
+DA:295,672
+DA:298,1344
+DA:304,1344
+DA:307,2688
+DA:314,1344
+DA:315,1344
+DA:316,1344
+DA:317,1344
+DA:319,1344
+DA:320,672
+DA:321,672
+DA:322,672
+DA:326,672
+DA:327,672
+DA:328,672
+DA:330,672
+DA:331,0
+DA:333,672
+DA:335,672
+DA:336,672
+DA:344,1344
+DA:350,96
+DA:352,288
+DA:356,192
+DA:360,0
+DA:361,0
+DA:362,0
+DA:363,0
+DA:366,192
+DA:367,192
+DA:369,144
+DA:372,96
+DA:378,96
+DA:381,96
+DA:388,96
+DA:390,96
+DA:391,96
+DA:394,96
+DA:397,0
+DA:398,0
+DA:399,0
+DA:400,0
+DA:401,0
+DA:402,0
+DA:403,0
+DA:406,96
+DA:407,96
+DA:408,96
+DA:409,96
+DA:412,96
+DA:415,192
+DA:416,96
+DA:417,0
+DA:419,96
+DA:424,1440
+DA:426,2688
+DA:427,1344
+DA:428,1344
+DA:429,672
+DA:430,672
+DA:436,1344
+DA:439,1344
+DA:440,1344
+DA:444,1344
+DA:449,192
+DA:454,96
+DA:456,96
+DA:458,96
+DA:460,96
+DA:461,96
+DA:463,96
+DA:464,96
+DA:465,96
+DA:466,96
+DA:470,96
+DA:471,96
+DA:472,96
+DA:478,0
+DA:482,0
+DA:488,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:493,0
+DA:494,0
+DA:495,0
+DA:496,0
+DA:499,0
+DA:502,0
+DA:505,0
+DA:508,0
+DA:509,0
+DA:514,0
+DA:516,0
+DA:517,0
+DA:518,0
+DA:519,0
+DA:521,0
+DA:523,0
+DA:524,0
+DA:528,0
+DA:529,0
+DA:534,0
+DA:536,0
+DA:537,0
+DA:538,0
+DA:540,0
+DA:542,0
+DA:543,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:552,0
+DA:555,0
+DA:558,0
+DA:559,0
+DA:562,0
+DA:564,0
+DA:568,0
+DA:574,0
+DA:575,0
+DA:576,0
+DA:577,0
+DA:578,0
+DA:579,0
+DA:584,0
+DA:585,0
+DA:586,0
+DA:589,0
+DA:591,0
+DA:595,0
+DA:597,0
+DA:600,0
+DA:601,0
+DA:602,0
+DA:604,0
+DA:607,0
+DA:613,0
+DA:614,0
+DA:615,0
+DA:616,0
+DA:617,0
+DA:623,0
+DA:624,0
+DA:625,0
+DA:632,0
+DA:633,0
+DA:637,0
+DA:639,0
+DA:640,0
+DA:642,0
+DA:647,0
+DA:650,0
+DA:651,0
+DA:652,0
+DA:653,0
+DA:659,0
+DA:660,0
+DA:666,0
+DA:667,0
+DA:673,0
+DA:685,0
+DA:686,0
+DA:687,0
+DA:688,0
+DA:690,0
+DA:691,0
+DA:692,0
+DA:694,0
+DA:695,0
+DA:701,0
+DA:704,0
+DA:705,0
+DA:706,0
+DA:707,0
+DA:708,0
+DA:709,0
+DA:710,0
+DA:711,0
+DA:716,0
+DA:720,0
+DA:721,0
+DA:724,0
+DA:725,0
+DA:726,0
+DA:727,0
+DA:730,0
+DA:731,0
+DA:734,0
+DA:735,0
+DA:736,0
+DA:737,0
+DA:738,0
+DA:739,0
+DA:741,0
+DA:742,0
+DA:744,0
+DA:745,0
+DA:747,0
+DA:751,0
+DA:752,0
+DA:753,0
+DA:754,0
+DA:755,0
+DA:756,0
+DA:757,0
+DA:758,0
+DA:759,0
+DA:760,0
+DA:761,0
+DA:762,0
+DA:763,0
+DA:764,0
+DA:765,0
+DA:767,0
+DA:768,0
+DA:771,0
+DA:772,0
+DA:775,0
+DA:776,0
+DA:780,0
+DA:781,0
+DA:782,0
+DA:786,0
+DA:787,0
+DA:788,0
+DA:789,0
+DA:790,0
+DA:791,0
+DA:793,0
+DA:794,0
+DA:796,0
+DA:797,0
+DA:800,0
+DA:804,0
+DA:805,0
+DA:806,0
+DA:810,0
+DA:811,0
+DA:812,0
+DA:813,0
+DA:814,0
+DA:815,0
+DA:816,0
+DA:818,0
+DA:819,0
+DA:821,0
+DA:822,0
+DA:825,0
+DA:826,0
+DA:832,0
+DA:840,0
+DA:841,0
+DA:843,0
+DA:844,0
+DA:845,0
+DA:846,0
+DA:847,0
+DA:848,0
+DA:851,0
+DA:853,0
+DA:854,0
+DA:855,0
+DA:856,0
+DA:857,0
+DA:858,0
+DA:859,0
+DA:860,0
+DA:862,0
+DA:863,0
+DA:867,0
+DA:870,0
+DA:874,0
+DA:875,0
+DA:876,0
+DA:879,0
+DA:880,0
+DA:881,0
+DA:887,0
+DA:891,0
+DA:901,0
+DA:902,0
+DA:903,0
+DA:904,0
+DA:905,0
+DA:908,0
+DA:909,0
+DA:910,0
+DA:911,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:918,0
+DA:922,0
+DA:923,0
+DA:924,0
+DA:925,0
+DA:927,0
+DA:929,0
+DA:933,0
+DA:934,0
+DA:935,0
+DA:936,0
+DA:937,0
+DA:939,0
+DA:941,0
+DA:942,0
+DA:943,0
+DA:944,0
+DA:946,0
+DA:948,0
+DA:954,0
+DA:958,0
+DA:967,0
+DA:968,0
+DA:969,0
+DA:970,0
+DA:973,0
+DA:974,0
+DA:976,0
+DA:977,0
+DA:978,0
+DA:979,0
+DA:983,0
+DA:984,0
+DA:985,0
+DA:988,0
+DA:991,0
+DA:992,0
+DA:1001,0
+DA:1002,0
+DA:1003,0
+DA:1007,0
+DA:1008,0
+DA:1009,0
+DA:1011,0
+DA:1012,0
+DA:1014,0
+DA:1016,0
+DA:1017,0
+DA:1019,0
+DA:1025,0
+DA:1026,0
+DA:1027,0
+DA:1028,0
+DA:1029,0
+DA:1035,0
+DA:1036,0
+DA:1037,0
+DA:1038,0
+DA:1039,0
+DA:1042,0
+DA:1045,0
+DA:1048,0
+DA:1051,0
+DA:1055,0
+DA:1056,0
+DA:1057,0
+DA:1060,0
+DA:1064,0
+DA:1065,0
+DA:1069,0
+DA:1073,0
+DA:1077,0
+DA:1080,0
+DA:1081,0
+DA:1082,0
+DA:1083,0
+DA:1089,0
+DA:1092,0
+DA:1093,0
+DA:1094,0
+DA:1099,0
+DA:1101,0
+DA:1102,0
+DA:1103,0
+DA:1104,0
+DA:1105,0
+DA:1106,0
+DA:1107,0
+DA:1108,0
+DA:1111,0
+DA:1114,0
+DA:1117,0
+DA:1118,0
+DA:1121,0
+DA:1124,0
+DA:1125,0
+DA:1130,0
+DA:1132,0
+DA:1136,0
+DA:1138,0
+DA:1139,0
+DA:1140,0
+DA:1141,0
+DA:1144,0
+DA:1145,0
+DA:1147,0
+DA:1149,0
+DA:1150,0
+DA:1154,0
+DA:1155,0
+DA:1160,0
+DA:1162,0
+DA:1166,0
+DA:1168,0
+DA:1169,0
+DA:1170,0
+DA:1171,0
+DA:1174,0
+DA:1175,0
+DA:1176,0
+DA:1178,0
+DA:1180,0
+DA:1181,0
+DA:1185,0
+DA:1189,0
+DA:1192,0
+DA:1193,0
+DA:1195,0
+DA:1198,0
+DA:1201,0
+DA:1202,0
+DA:1205,0
+DA:1209,0
+DA:1210,0
+DA:1211,0
+DA:1214,0
+DA:1218,0
+DA:1224,0
+DA:1225,0
+DA:1226,0
+DA:1228,0
+DA:1230,0
+DA:1231,0
+DA:1236,0
+DA:1237,0
+DA:1238,0
+DA:1241,0
+DA:1243,0
+DA:1248,0
+DA:1250,0
+DA:1251,0
+DA:1252,0
+DA:1253,0
+DA:1255,0
+DA:1258,0
+DA:1264,0
+DA:1265,0
+DA:1266,0
+DA:1267,0
+DA:1268,0
+DA:1274,0
+DA:1275,0
+DA:1276,0
+DA:1280,0
+DA:1284,0
+DA:1285,0
+DA:1289,0
+DA:1291,0
+DA:1292,0
+DA:1294,0
+DA:1299,0
+DA:1302,0
+DA:1305,0
+DA:1306,0
+DA:1307,0
+DA:1308,0
+DA:1309,0
+DA:1315,0
+DA:1316,0
+DA:1322,0
+DA:1324,0
+DA:1330,0
+DA:1331,0
+DA:1332,0
+DA:1334,0
+DA:1335,0
+DA:1341,0
+DA:1342,0
+DA:1343,0
+DA:1344,0
+DA:1345,0
+DA:1348,0
+DA:1354,0
+DA:1355,0
+DA:1356,0
+DA:1357,0
+DA:1361,0
+DA:1367,0
+DA:1368,0
+DA:1369,0
+DA:1370,0
+DA:1374,0
+DA:1378,0
+DA:1379,0
+DA:1380,0
+DA:1381,0
+DA:1382,0
+DA:1386,0
+DA:1389,0
+DA:1390,0
+DA:1393,0
+DA:1394,0
+DA:1395,0
+DA:1398,0
+DA:1402,0
+DA:1403,0
+DA:1404,0
+DA:1410,0
+DA:1411,0
+DA:1412,0
+DA:1413,0
+DA:1416,0
+DA:1417,0
+DA:1418,0
+DA:1419,0
+DA:1420,0
+DA:1421,0
+DA:1422,0
+DA:1424,0
+DA:1425,0
+DA:1430,0
+DA:1431,0
+DA:1432,0
+DA:1433,0
+DA:1436,0
+DA:1440,0
+DA:1441,0
+DA:1445,0
+DA:1450,0
+DA:1452,0
+DA:1458,0
+DA:1459,0
+DA:1460,0
+DA:1461,0
+DA:1464,0
+DA:1465,0
+DA:1466,0
+DA:1467,0
+DA:1468,0
+DA:1470,0
+DA:1471,0
+DA:1473,0
+DA:1476,0
+DA:1478,0
+DA:1479,0
+DA:1480,0
+DA:1481,0
+DA:1486,0
+DA:1488,0
+DA:1489,0
+DA:1490,0
+DA:1491,0
+DA:1492,0
+DA:1493,0
+DA:1494,0
+DA:1497,0
+DA:1498,0
+DA:1500,0
+DA:1502,0
+DA:1505,0
+DA:1508,0
+DA:1509,0
+DA:1510,0
+DA:1514,0
+DA:1518,0
+DA:1521,0
+DA:1522,0
+DA:1523,0
+DA:1524,0
+DA:1525,0
+DA:1526,0
+DA:1527,0
+DA:1533,0
+DA:1534,0
+DA:1535,0
+DA:1537,0
+DA:1538,0
+DA:1544,0
+DA:1545,0
+DA:1546,0
+DA:1547,0
+DA:1548,0
+DA:1551,0
+DA:1557,0
+DA:1558,0
+DA:1559,0
+DA:1560,0
+DA:1564,0
+DA:1570,0
+DA:1571,0
+DA:1572,0
+DA:1573,0
+DA:1577,0
+DA:1580,0
+DA:1584,0
+DA:1585,0
+DA:1586,0
+DA:1587,0
+DA:1588,0
+DA:1592,0
+DA:1595,0
+DA:1596,0
+DA:1599,0
+DA:1601,0
+DA:1602,0
+DA:1605,0
+DA:1609,0
+DA:1610,0
+DA:1611,0
+DA:1617,0
+DA:1618,0
+DA:1619,0
+DA:1620,0
+DA:1623,0
+DA:1624,0
+DA:1625,0
+DA:1626,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1631,0
+DA:1632,0
+DA:1633,0
+DA:1634,0
+DA:1640,0
+DA:1641,0
+DA:1642,0
+DA:1643,0
+DA:1644,0
+DA:1647,0
+DA:1651,0
+DA:1652,0
+DA:1656,0
+DA:1661,0
+DA:1663,0
+DA:1669,0
+DA:1670,0
+DA:1671,0
+DA:1672,0
+DA:1675,0
+DA:1676,0
+DA:1677,0
+DA:1678,0
+DA:1679,0
+DA:1681,0
+DA:1682,0
+DA:1684,0
+DA:1687,0
+DA:1689,0
+DA:1690,0
+DA:1691,0
+DA:1697,0
+DA:1698,0
+DA:1699,0
+DA:1700,0
+DA:1702,0
+DA:1703,0
+DA:1705,0
+DA:1709,0
+DA:1712,0
+DA:1713,0
+DA:1714,0
+DA:1718,0
+DA:1722,0
+DA:1725,0
+DA:1728,0
+DA:1729,0
+DA:1730,0
+DA:1731,0
+DA:1732,0
+DA:1733,0
+DA:1734,0
+DA:1740,96
+DA:1743,96
+DA:1744,96
+DA:1745,96
+DA:1746,96
+DA:1747,96
+DA:1748,384
+DA:1749,288
+DA:1751,96
+DA:1752,96
+DA:1754,288
+DA:1755,192
+DA:1756,192
+DA:1758,96
+DA:1759,96
+DA:1761,192
+DA:1762,96
+DA:1763,96
+DA:1765,96
+DA:1766,96
+DA:1767,96
+DA:1769,96
+DA:1770,96
+DA:1776,96
+DA:1780,96
+DA:1781,96
+DA:1782,96
+DA:1783,96
+DA:1784,96
+DA:1785,96
+DA:1786,96
+DA:1787,96
+DA:1788,96
+DA:1794,0
+DA:1798,0
+DA:1799,0
+DA:1800,0
+DA:1801,0
+DA:1802,0
+DA:1804,0
+DA:1805,0
+DA:1806,0
+DA:1807,0
+DA:1808,0
+DA:1809,0
+LF:868
+LH:222
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-qfunction.c
+FNL:0,17,45
+FNA:0,2304,CeedQFunctionApply_Ref
+FNL:1,50,58
+FNA:1,288,CeedQFunctionDestroy_Ref
+FNL:2,63,76
+FNA:2,288,CeedQFunctionCreate_Ref
+FNF:3
+FNH:3
+DA:17,2304
+DA:18,2304
+DA:20,2304
+DA:23,2304
+DA:24,2304
+DA:25,2304
+DA:26,2304
+DA:28,6912
+DA:29,4608
+DA:31,4608
+DA:32,2304
+DA:35,2304
+DA:37,6912
+DA:38,4608
+DA:40,4608
+DA:41,2304
+DA:43,2304
+DA:44,2304
+DA:50,288
+DA:53,288
+DA:54,288
+DA:55,288
+DA:56,288
+DA:57,288
+DA:63,288
+DA:67,288
+DA:68,288
+DA:69,288
+DA:70,288
+DA:71,288
+DA:72,288
+DA:73,288
+DA:74,288
+DA:75,288
+LF:34
+LH:34
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-qfunctioncontext.c
+FNL:0,103,103
+FNA:0,1188,CeedQFunctionContextRestoreData_Ref
+FNL:1,108,115
+FNA:1,216,CeedQFunctionContextDestroy_Ref
+FNL:2,120,138
+FNA:2,216,CeedQFunctionContextCreate_Ref
+FNL:3,18,24
+FNA:3,1188,CeedQFunctionContextHasValidData_Ref
+FNL:4,29,36
+FNA:4,0,CeedQFunctionContextHasBorrowedDataOfType_Ref
+FNL:5,41,68
+FNA:5,216,CeedQFunctionContextSetData_Ref
+FNL:6,73,84
+FNA:6,0,CeedQFunctionContextTakeData_Ref
+FNL:7,89,98
+FNA:7,1188,CeedQFunctionContextGetData_Ref
+FNF:8
+FNH:6
+DA:18,1188
+DA:21,1188
+DA:22,1188
+DA:23,1188
+DA:29,0
+DA:32,0
+DA:33,0
+DA:34,0
+DA:35,0
+DA:41,216
+DA:45,216
+DA:46,216
+DA:48,216
+DA:50,216
+DA:51,216
+DA:52,0
+DA:53,0
+DA:54,0
+DA:55,0
+DA:56,0
+DA:57,0
+DA:58,72
+DA:59,72
+DA:60,72
+DA:61,72
+DA:62,72
+DA:63,144
+DA:64,144
+DA:65,144
+DA:67,216
+DA:73,0
+DA:76,0
+DA:78,0
+DA:80,0
+DA:81,0
+DA:82,0
+DA:83,0
+DA:89,1188
+DA:92,1188
+DA:94,1188
+DA:96,1188
+DA:97,1188
+DA:103,1188
+DA:108,216
+DA:111,216
+DA:112,216
+DA:113,216
+DA:114,216
+DA:120,216
+DA:124,216
+DA:125,216
+DA:126,216
+DA:127,216
+DA:128,216
+DA:129,216
+DA:130,216
+DA:131,216
+DA:132,216
+DA:133,216
+DA:134,216
+DA:135,216
+DA:136,216
+DA:137,216
+LF:63
+LH:45
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-restriction.c
+FNL:0,138,179
+FNA:0,0,CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core
+FNL:1,181,218
+FNA:1,816,CeedElemRestrictionApplyStridedTranspose_Ref_Core
+FNL:2,19,57
+FNA:2,144,CeedElemRestrictionApplyStridedNoTranspose_Ref_Core
+FNL:3,220,242
+FNA:3,816,CeedElemRestrictionApplyOffsetTranspose_Ref_Core
+FNL:4,244,266
+FNA:4,0,CeedElemRestrictionApplyOrientedTranspose_Ref_Core
+FNL:5,268,317
+FNA:5,0,CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core
+FNL:6,319,369
+FNA:6,0,CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core
+FNL:7,371,394
+FNA:7,0,CeedElemRestrictionApplyAtPointsInElement_Ref_Core
+FNL:8,396,503
+FNA:8,3408,CeedElemRestrictionApply_Ref_Core
+FNL:9,508,512
+FNA:9,2680,CeedElemRestrictionApply_Ref_110
+FNL:10,514,518
+FNA:10,0,CeedElemRestrictionApply_Ref_111
+FNL:11,520,524
+FNA:11,504,CeedElemRestrictionApply_Ref_180
+FNL:12,526,530
+FNA:12,0,CeedElemRestrictionApply_Ref_181
+FNL:13,532,536
+FNA:13,40,CeedElemRestrictionApply_Ref_310
+FNL:14,538,542
+FNA:14,0,CeedElemRestrictionApply_Ref_311
+FNL:15,544,548
+FNA:15,24,CeedElemRestrictionApply_Ref_380
+FNL:16,550,554
+FNA:16,0,CeedElemRestrictionApply_Ref_381
+FNL:17,588,592
+FNA:17,0,CeedElemRestrictionApply_Ref_511
+FNL:18,59,75
+FNA:18,1632,CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core
+FNL:19,602,606
+FNA:19,0,CeedElemRestrictionApply_Ref_581
+FNL:20,611,622
+FNA:20,336,CeedElemRestrictionApply_Ref
+FNL:21,627,639
+FNA:21,0,CeedElemRestrictionApplyUnsigned_Ref
+FNL:22,644,656
+FNA:22,0,CeedElemRestrictionApplyUnoriented_Ref
+FNL:23,661,669
+FNA:23,0,CeedElemRestrictionApplyAtPointsInElement_Ref
+FNL:24,674,685
+FNA:24,3072,CeedElemRestrictionApplyBlock_Ref
+FNL:25,690,699
+FNA:25,360,CeedElemRestrictionGetOffsets_Ref
+FNL:26,704,713
+FNA:26,0,CeedElemRestrictionGetOrientations_Ref
+FNL:27,718,727
+FNA:27,0,CeedElemRestrictionGetCurlOrientations_Ref
+FNL:28,732,741
+FNA:28,816,CeedElemRestrictionDestroy_Ref
+FNL:29,746,910
+FNA:29,1032,CeedElemRestrictionCreate_Ref
+FNL:30,77,94
+FNA:30,0,CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core
+FNL:31,96,136
+FNA:31,0,CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core
+FNF:32
+FNH:14
+DA:19,144
+DA:26,144
+DA:27,144
+DA:30,1296
+DA:31,2304
+DA:32,29664
+DA:33,103392
+DA:34,74880
+DA:35,74880
+DA:44,0
+DA:45,0
+DA:46,0
+DA:47,0
+DA:48,0
+DA:49,0
+DA:50,0
+DA:56,144
+DA:59,1632
+DA:66,1632
+DA:67,3936
+DA:68,4968
+DA:69,166824
+DA:70,164160
+DA:74,1632
+DA:77,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:89,0
+DA:93,0
+DA:96,0
+DA:103,0
+DA:104,0
+DA:105,0
+DA:106,0
+DA:108,0
+DA:109,0
+DA:110,0
+DA:111,0
+DA:112,0
+DA:113,0
+DA:115,0
+DA:116,0
+DA:117,0
+DA:118,0
+DA:119,0
+DA:120,0
+DA:121,0
+DA:122,0
+DA:123,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:130,0
+DA:131,0
+DA:135,0
+DA:138,0
+DA:146,0
+DA:147,0
+DA:148,0
+DA:149,0
+DA:151,0
+DA:152,0
+DA:153,0
+DA:154,0
+DA:155,0
+DA:156,0
+DA:158,0
+DA:159,0
+DA:160,0
+DA:161,0
+DA:162,0
+DA:163,0
+DA:164,0
+DA:165,0
+DA:166,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:178,0
+DA:181,816
+DA:188,816
+DA:189,816
+DA:192,1968
+DA:193,2304
+DA:194,29664
+DA:195,72288
+DA:196,43776
+DA:205,0
+DA:206,0
+DA:207,0
+DA:208,0
+DA:209,0
+DA:210,0
+DA:211,0
+DA:217,816
+DA:220,816
+DA:227,816
+DA:228,1968
+DA:229,2304
+DA:230,19872
+DA:232,48000
+DA:235,29280
+DA:236,29280
+DA:241,816
+DA:244,0
+DA:251,0
+DA:252,0
+DA:253,0
+DA:254,0
+DA:256,0
+DA:259,0
+DA:260,0
+DA:265,0
+DA:268,0
+DA:271,0
+DA:274,0
+DA:276,0
+DA:277,0
+DA:278,0
+DA:280,0
+DA:281,0
+DA:283,0
+DA:284,0
+DA:285,0
+DA:286,0
+DA:287,0
+DA:289,0
+DA:290,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:296,0
+DA:297,0
+DA:298,0
+DA:299,0
+DA:301,0
+DA:302,0
+DA:305,0
+DA:306,0
+DA:307,0
+DA:308,0
+DA:309,0
+DA:311,0
+DA:312,0
+DA:316,0
+DA:319,0
+DA:323,0
+DA:326,0
+DA:328,0
+DA:329,0
+DA:330,0
+DA:332,0
+DA:333,0
+DA:335,0
+DA:336,0
+DA:337,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:342,0
+DA:344,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:350,0
+DA:351,0
+DA:353,0
+DA:354,0
+DA:357,0
+DA:358,0
+DA:359,0
+DA:360,0
+DA:361,0
+DA:363,0
+DA:364,0
+DA:368,0
+DA:371,0
+DA:375,0
+DA:378,0
+DA:379,0
+DA:380,0
+DA:381,0
+DA:382,0
+DA:383,0
+DA:384,0
+DA:387,0
+DA:388,0
+DA:391,0
+DA:393,0
+DA:396,3408
+DA:400,3408
+DA:405,3408
+DA:406,3408
+DA:407,3408
+DA:408,3408
+DA:409,3408
+DA:411,3408
+DA:413,1632
+DA:416,1776
+DA:418,3408
+DA:424,1632
+DA:425,816
+DA:426,816
+DA:428,816
+DA:429,816
+DA:430,816
+DA:432,816
+DA:433,0
+DA:434,0
+DA:435,0
+DA:438,0
+DA:441,0
+DA:442,0
+DA:443,0
+DA:444,0
+DA:446,0
+DA:447,0
+DA:450,0
+DA:453,0
+DA:454,0
+DA:455,0
+DA:456,0
+DA:464,1776
+DA:465,144
+DA:466,144
+DA:468,144
+DA:469,1632
+DA:470,1632
+DA:472,1632
+DA:473,0
+DA:474,0
+DA:475,0
+DA:478,0
+DA:481,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:486,0
+DA:487,0
+DA:490,0
+DA:493,0
+DA:494,0
+DA:495,0
+DA:496,0
+DA:499,3408
+DA:500,3408
+DA:501,3408
+DA:502,3408
+DA:508,2680
+DA:511,2680
+DA:514,0
+DA:517,0
+DA:520,504
+DA:523,504
+DA:526,0
+DA:529,0
+DA:532,40
+DA:535,40
+DA:538,0
+DA:541,0
+DA:544,24
+DA:547,24
+DA:550,0
+DA:553,0
+DA:588,0
+DA:591,0
+DA:602,0
+DA:605,0
+DA:611,336
+DA:615,336
+DA:616,336
+DA:617,336
+DA:618,336
+DA:619,336
+DA:620,336
+DA:621,336
+DA:627,0
+DA:632,0
+DA:633,0
+DA:634,0
+DA:635,0
+DA:636,0
+DA:637,0
+DA:638,0
+DA:644,0
+DA:649,0
+DA:650,0
+DA:651,0
+DA:652,0
+DA:653,0
+DA:654,0
+DA:655,0
+DA:661,0
+DA:666,0
+DA:667,0
+DA:668,0
+DA:674,3072
+DA:679,3072
+DA:680,3072
+DA:681,3072
+DA:682,3072
+DA:683,3072
+DA:684,3072
+DA:690,360
+DA:693,360
+DA:695,360
+DA:697,360
+DA:698,360
+DA:704,0
+DA:707,0
+DA:709,0
+DA:711,0
+DA:712,0
+DA:718,0
+DA:721,0
+DA:723,0
+DA:725,0
+DA:726,0
+DA:732,816
+DA:735,816
+DA:736,816
+DA:737,816
+DA:738,816
+DA:739,816
+DA:740,816
+DA:746,1032
+DA:749,1032
+DA:753,1032
+DA:754,1032
+DA:755,1032
+DA:756,1032
+DA:757,1032
+DA:758,1032
+DA:759,1032
+DA:760,1032
+DA:762,1032
+DA:764,1032
+DA:765,1032
+DA:770,1032
+DA:772,1032
+DA:773,1032
+DA:774,384
+DA:775,384
+DA:776,384
+DA:782,1032
+DA:783,0
+DA:785,0
+DA:786,0
+DA:788,0
+DA:789,0
+DA:792,0
+DA:793,0
+DA:797,1032
+DA:802,648
+DA:804,648
+DA:805,648
+DA:806,648
+DA:808,648
+DA:811,168
+DA:812,34328
+DA:813,34160
+DA:819,648
+DA:820,648
+DA:821,648
+DA:824,648
+DA:825,0
+DA:826,0
+DA:827,648
+DA:828,0
+DA:829,0
+DA:835,1032
+DA:837,1032
+DA:838,1032
+DA:839,544
+DA:840,544
+DA:841,544
+DA:842,0
+DA:843,0
+DA:844,0
+DA:845,312
+DA:846,312
+DA:847,312
+DA:848,0
+DA:849,0
+DA:850,0
+DA:851,64
+DA:852,64
+DA:853,64
+DA:854,0
+DA:855,0
+DA:856,0
+DA:857,24
+DA:858,24
+DA:859,24
+DA:860,0
+DA:861,0
+DA:862,0
+DA:880,0
+DA:881,0
+DA:882,0
+DA:888,0
+DA:889,0
+DA:890,0
+DA:891,88
+DA:892,88
+DA:893,88
+DA:897,1032
+DA:898,1032
+DA:899,1032
+DA:900,1032
+DA:901,0
+DA:903,1032
+DA:904,1032
+DA:905,1032
+DA:906,1032
+DA:907,1032
+DA:908,1032
+DA:909,1032
+LF:428
+LH:162
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-tensor.c
+FNL:0,16,38
+FNA:0,4032,CeedTensorContractApply_Ref
+FNL:1,43,43
+FNA:1,192,CeedTensorContractDestroy_Ref
+FNL:2,48,56
+FNA:2,192,CeedTensorContractCreate_Ref
+FNF:3
+FNH:3
+DA:16,4032
+DA:18,4032
+DA:20,4032
+DA:21,1008
+DA:22,1008
+DA:25,4032
+DA:26,830208
+DA:29,26160
+DA:30,145200
+DA:31,839616
+DA:32,716544
+DA:33,5454912
+DA:37,4032
+DA:43,192
+DA:48,192
+DA:51,192
+DA:52,192
+DA:53,192
+DA:54,192
+DA:55,192
+LF:20
+LH:20
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-vector.c
+FNL:0,103,110
+FNA:0,8856,CeedVectorGetArrayWrite_Ref
+FNL:1,115,115
+FNA:1,10560,CeedVectorRestoreArray_Ref
+FNL:2,120,120
+FNA:2,11856,CeedVectorRestoreArrayRead_Ref
+FNL:3,125,132
+FNA:3,2784,CeedVectorDestroy_Ref
+FNL:4,137,156
+FNA:4,2784,CeedVectorCreate_Ref
+FNL:5,18,25
+FNA:5,13560,CeedVectorHasValidArray_Ref
+FNL:6,30,37
+FNA:6,0,CeedVectorHasBorrowedArrayOfType_Ref
+FNL:7,42,54
+FNA:7,4752,CeedVectorSetArray_Ref
+FNL:8,59,70
+FNA:8,0,CeedVectorTakeArray_Ref
+FNL:9,75,84
+FNA:9,22416,CeedVectorGetArrayCore_Ref
+FNL:10,89,91
+FNA:10,11856,CeedVectorGetArrayRead_Ref
+FNL:11,96,98
+FNA:11,1704,CeedVectorGetArray_Ref
+FNF:12
+FNH:10
+DA:18,13560
+DA:21,13560
+DA:23,13560
+DA:24,13560
+DA:30,0
+DA:33,0
+DA:34,0
+DA:35,0
+DA:36,0
+DA:42,4752
+DA:46,4752
+DA:47,4752
+DA:49,4752
+DA:51,4752
+DA:53,4752
+DA:59,0
+DA:62,0
+DA:64,0
+DA:66,0
+DA:67,0
+DA:68,0
+DA:69,0
+DA:75,22416
+DA:78,22416
+DA:80,22416
+DA:82,22416
+DA:83,22416
+DA:89,11856
+DA:90,11856
+DA:96,1704
+DA:97,1704
+DA:103,8856
+DA:106,8856
+DA:108,8856
+DA:109,8856
+DA:115,10560
+DA:120,11856
+DA:125,2784
+DA:128,2784
+DA:129,2784
+DA:130,2784
+DA:131,2784
+DA:137,2784
+DA:141,2784
+DA:142,2784
+DA:143,2784
+DA:144,2784
+DA:145,2784
+DA:146,2784
+DA:147,2784
+DA:148,2784
+DA:149,2784
+DA:150,2784
+DA:151,2784
+DA:152,2784
+DA:153,2784
+DA:154,2784
+DA:155,2784
+LF:58
+LH:46
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref.c
+FNL:0,17,36
+FNA:0,192,CeedInit_Ref
+FNL:1,41,46
+FNA:1,192,CeedRegister_Ref
+FNF:2
+FNH:2
+DA:17,192
+DA:18,192
+DA:20,192
+DA:22,192
+DA:23,192
+DA:24,192
+DA:25,192
+DA:26,192
+DA:27,192
+DA:28,192
+DA:29,192
+DA:30,192
+DA:31,192
+DA:32,192
+DA:33,192
+DA:34,192
+DA:35,192
+DA:41,192
+DA:44,192
+LF:19
+LH:19
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/weak/ceed-backend-weak.c
+FNF:0
+FNH:0
+LF:0
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-blocked.c
+FNL:0,18,32
+FNA:0,24,CeedInit_Xsmm_Blocked
+FNL:1,37,37
+FNA:1,192,CeedRegister_Xsmm_Blocked
+FNF:2
+FNH:2
+DA:18,24
+DA:21,24
+DA:23,24
+DA:26,24
+DA:27,24
+DA:28,24
+DA:30,24
+DA:31,24
+DA:37,192
+LF:9
+LH:9
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-serial.c
+FNL:0,18,32
+FNA:0,24,CeedInit_Xsmm_Serial
+FNL:1,37,37
+FNA:1,192,CeedRegister_Xsmm_Serial
+FNF:2
+FNH:2
+DA:18,24
+DA:21,24
+DA:23,24
+DA:26,24
+DA:27,24
+DA:28,24
+DA:30,24
+DA:31,24
+DA:37,192
+LF:9
+LH:9
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-tensor.c
+FNL:0,17,63
+FNA:0,2016,CeedTensorContractApply_Xsmm
+FNL:1,68,71
+FNA:1,96,CeedTensorContractCreate_Xsmm
+FNF:2
+FNH:2
+DA:17,2016
+DA:19,2016
+DA:21,1344
+DA:22,1344
+DA:23,1344
+DA:27,1344
+DA:29,1344
+DA:32,1344
+DA:35,1344
+DA:36,1344
+DA:37,1344
+DA:38,1344
+DA:41,672
+DA:42,672
+DA:43,672
+DA:47,672
+DA:49,672
+DA:52,672
+DA:55,672
+DA:56,4696
+DA:57,4024
+DA:58,4024
+DA:59,4024
+DA:62,2016
+DA:68,96
+DA:69,96
+DA:70,96
+LF:27
+LH:27
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/ex1-volumetest/(t*.test/(t*.-f.h))
+FNL:0,2,36
+FNA:0,384,build_mass_
+FNL:1,39,54
+FNA:1,384,apply_mass_
+FNF:2
+FNH:2
+DA:2,384
+DA:14,384
+DA:15,384
+DA:17,672
+DA:19,3648
+DA:20,3360
+DA:24,4752
+DA:25,4680
+DA:29,17664
+DA:32,17304
+DA:35,384
+DA:36,384
+DA:39,384
+DA:50,25344
+DA:51,25344
+DA:53,384
+DA:54,384
+LF:17
+LH:17
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume-f.f90
+FNL:0,157,190
+FNA:0,96,transformmeshcoords_
+FNL:1,193,250
+FNA:1,96,setcartesianmeshcoords_
+FNL:2,253,557
+FNA:2,96,MAIN__
+FNL:3,42,72
+FNA:3,96,getcartesianmeshsize_
+FNL:4,557,557
+FNA:4,96,main
+FNL:5,75,154
+FNA:5,192,buildcartesianrestriction_
+FNF:6
+FNH:6
+DA:42,96
+DA:51,96
+DA:52,96
+DA:58,384
+DA:59,288
+DA:60,288
+DA:62,96
+DA:64,288
+DA:65,192
+DA:66,192
+DA:67,64
+DA:68,64
+DA:70,288
+DA:72,96
+DA:75,192
+DA:100,192
+DA:104,192
+DA:105,192
+DA:106,192
+DA:107,192
+DA:108,192
+DA:110,576
+DA:111,384
+DA:112,384
+DA:113,576
+DA:115,192
+DA:119,192
+DA:120,192
+DA:122,2880
+DA:123,2688
+DA:124,2688
+DA:125,2688
+DA:126,2688
+DA:128,6144
+DA:129,3456
+DA:130,6144
+DA:133,41920
+DA:134,39040
+DA:135,39040
+DA:136,39040
+DA:138,122880
+DA:139,83840
+DA:140,83840
+DA:141,122880
+DA:143,41728
+DA:148,192
+DA:149,192
+DA:151,96
+DA:153,192
+DA:154,192
+DA:157,96
+DA:172,96
+DA:173,128
+DA:175,4160
+DA:176,4160
+DA:178,32
+DA:181,12160
+DA:182,12096
+DA:183,12096
+DA:185,12096
+DA:186,12160
+DA:188,160
+DA:190,96
+DA:193,96
+DA:210,96
+DA:211,96
+DA:214,96
+DA:215,96
+DA:217,288
+DA:218,192
+DA:219,288
+DA:222,96
+DA:223,96
+DA:226,96
+DA:227,96
+DA:228,96
+DA:229,96
+DA:230,576
+DA:231,576
+DA:234,16320
+DA:235,16224
+DA:237,51840
+DA:238,35520
+DA:239,35520
+DA:240,51744
+DA:243,96
+DA:245,96
+DA:247,96
+DA:248,96
+DA:249,96
+DA:250,96
+DA:253,96
+DA:267,96
+DA:282,96
+DA:283,96
+DA:284,96
+DA:285,96
+DA:286,96
+DA:287,96
+DA:288,96
+DA:289,96
+DA:290,96
+DA:291,96
+DA:292,96
+DA:296,96
+DA:297,624
+DA:298,528
+DA:300,96
+DA:303,0
+DA:306,96
+DA:309,96
+DA:310,96
+DA:311,96
+DA:314,0
+DA:315,0
+DA:318,0
+DA:319,0
+DA:322,0
+DA:323,0
+DA:326,0
+DA:327,0
+DA:330,0
+DA:331,0
+DA:335,96
+DA:338,528
+DA:343,96
+DA:344,96
+DA:345,96
+DA:347,0
+DA:352,96
+DA:354,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:358,0
+DA:359,0
+DA:360,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:366,0
+DA:367,0
+DA:368,0
+DA:370,0
+DA:377,96
+DA:381,96
+DA:382,96
+DA:385,96
+DA:386,96
+DA:388,0
+DA:389,0
+DA:390,0
+DA:392,0
+DA:393,0
+DA:395,0
+DA:401,96
+DA:403,96
+DA:405,96
+DA:407,0
+DA:408,0
+DA:414,96
+DA:415,96
+DA:418,96
+DA:419,96
+DA:420,96
+DA:422,96
+DA:424,96
+DA:427,96
+DA:428,16
+DA:430,16
+DA:433,16
+DA:436,48
+DA:441,48
+DA:442,48
+DA:443,48
+DA:444,48
+DA:445,48
+DA:449,96
+DA:450,96
+DA:451,96
+DA:452,96
+DA:455,96
+DA:456,96
+DA:457,288
+DA:458,288
+DA:460,96
+DA:461,96
+DA:464,96
+DA:465,48
+DA:469,48
+DA:470,48
+DA:471,48
+DA:472,48
+DA:476,96
+DA:477,96
+DA:478,96
+DA:479,96
+DA:482,96
+DA:483,96
+DA:485,96
+DA:486,96
+DA:487,96
+DA:488,96
+DA:489,96
+DA:490,96
+DA:493,96
+DA:496,96
+DA:499,96
+DA:501,0
+DA:504,96
+DA:506,96
+DA:511,96
+DA:513,96
+DA:514,16320
+DA:515,16320
+DA:517,96
+DA:519,96
+DA:521,0
+DA:522,0
+DA:523,0
+DA:524,0
+DA:527,96
+DA:528,32
+DA:530,0
+DA:534,64
+DA:536,0
+DA:543,96
+DA:544,96
+DA:545,96
+DA:546,96
+DA:547,96
+DA:548,96
+DA:549,96
+DA:550,96
+DA:551,96
+DA:552,96
+DA:553,96
+DA:554,96
+DA:555,96
+DA:556,96
+DA:557,96
+LF:241
+LH:200
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.c
+FNL:0,294,316
+FNA:0,96,GetCartesianMeshSize
+FNL:1,318,363
+FNA:1,192,BuildCartesianRestriction
+FNL:2,365,394
+FNA:2,96,SetCartesianMeshCoords
+FNL:3,401,429
+FNA:3,96,TransformMeshCoords
+FNL:4,55,292
+FNA:4,96,main
+FNF:5
+FNH:5
+DA:55,96
+DA:56,96
+DA:57,96
+DA:58,96
+DA:59,96
+DA:60,96
+DA:61,96
+DA:62,96
+DA:63,96
+DA:66,432
+DA:97,96
+DA:100,96
+DA:121,96
+DA:126,96
+DA:127,96
+DA:130,96
+DA:132,96
+DA:133,96
+DA:146,96
+DA:147,96
+DA:148,96
+DA:158,96
+DA:159,96
+DA:162,96
+DA:168,96
+DA:169,96
+DA:170,96
+DA:175,96
+DA:177,48
+DA:178,48
+DA:179,48
+DA:182,48
+DA:183,48
+DA:184,48
+DA:185,48
+DA:186,48
+DA:192,96
+DA:193,96
+DA:194,96
+DA:195,96
+DA:199,96
+DA:200,96
+DA:202,288
+DA:203,96
+DA:204,96
+DA:209,96
+DA:211,48
+DA:214,48
+DA:215,48
+DA:216,48
+DA:217,48
+DA:223,96
+DA:224,96
+DA:225,96
+DA:226,96
+DA:231,96
+DA:232,96
+DA:235,96
+DA:238,96
+DA:241,96
+DA:246,96
+DA:253,96
+DA:258,96
+DA:259,16320
+DA:260,96
+DA:262,96
+DA:270,96
+DA:272,96
+DA:276,96
+DA:277,96
+DA:278,96
+DA:279,96
+DA:280,96
+DA:281,96
+DA:282,96
+DA:283,96
+DA:284,96
+DA:285,96
+DA:286,96
+DA:287,96
+DA:288,96
+DA:289,96
+DA:290,96
+DA:291,96
+DA:294,96
+DA:297,96
+DA:298,96
+DA:300,384
+DA:301,288
+DA:302,288
+DA:304,96
+DA:306,288
+DA:307,192
+DA:309,192
+DA:310,64
+DA:311,64
+DA:313,192
+DA:315,96
+DA:318,192
+DA:319,192
+DA:320,192
+DA:321,192
+DA:322,192
+DA:323,192
+DA:325,576
+DA:326,384
+DA:327,384
+DA:328,384
+DA:330,192
+DA:334,192
+DA:336,2880
+DA:337,2688
+DA:339,6144
+DA:340,3456
+DA:341,3456
+DA:343,2688
+DA:345,41728
+DA:346,39040
+DA:348,122880
+DA:349,83840
+DA:350,83840
+DA:351,83840
+DA:353,39040
+DA:356,192
+DA:358,192
+DA:359,96
+DA:361,192
+DA:362,192
+DA:365,96
+DA:366,96
+DA:367,96
+DA:369,288
+DA:370,192
+DA:371,192
+DA:375,96
+DA:376,96
+DA:379,96
+DA:380,576
+DA:381,16320
+DA:382,16224
+DA:384,51744
+DA:385,35520
+DA:387,35520
+DA:388,35520
+DA:391,96
+DA:392,96
+DA:393,96
+DA:401,96
+DA:405,96
+DA:406,96
+DA:407,4160
+DA:409,4128
+DA:411,32
+DA:413,64
+DA:415,12160
+DA:418,12096
+DA:420,12096
+DA:421,12096
+DA:422,12096
+DA:423,12096
+DA:425,64
+DA:427,96
+DA:428,96
+LF:163
+LH:163
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h
+FNL:0,16,52
+FNA:0,384,build_mass
+FNL:1,55,64
+FNA:1,384,apply_mass
+FNF:2
+FNH:2
+DA:16,384
+DA:17,384
+DA:21,384
+DA:22,384
+DA:24,384
+DA:25,288
+DA:26,288
+DA:29,3360
+DA:30,288
+DA:31,72
+DA:32,72
+DA:35,4680
+DA:36,4608
+DA:38,72
+DA:39,24
+DA:40,24
+DA:43,17304
+DA:44,17280
+DA:45,17280
+DA:46,17280
+DA:47,17280
+DA:49,24
+DA:51,384
+DA:55,384
+DA:58,384
+DA:59,384
+DA:62,25344
+DA:63,384
+LF:28
+LH:28
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/ceed-gallery-list.h
+FNF:0
+FNH:0
+DA:15,96
+DA:16,96
+DA:17,96
+DA:18,96
+DA:19,96
+DA:20,96
+DA:21,96
+DA:22,96
+DA:23,96
+DA:24,96
+DA:25,96
+DA:26,96
+DA:27,96
+DA:28,96
+DA:29,96
+DA:30,96
+LF:16
+LH:16
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/identity/ceed-identity.c
+FNL:0,17,36
+FNA:0,0,CeedQFunctionInit_Identity
+FNL:1,41,43
+FNA:1,96,CeedQFunctionRegister_Identity
+FNF:2
+FNH:1
+DA:17,0
+DA:19,0
+DA:20,0
+DA:24,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:33,0
+DA:35,0
+DA:41,96
+DA:42,96
+LF:13
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass-vector/ceed-vectormassapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3MassApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3MassApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass1dbuild.c
+FNL:0,16,30
+FNA:0,32,CeedQFunctionInit_Mass1DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Mass1DBuild
+FNF:2
+FNH:2
+DA:16,32
+DA:18,32
+DA:19,32
+DA:22,32
+DA:23,32
+DA:24,32
+DA:25,32
+DA:27,32
+DA:29,32
+DA:35,96
+DA:36,96
+LF:11
+LH:11
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass2dbuild.c
+FNL:0,16,30
+FNA:0,32,CeedQFunctionInit_Mass2DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Mass2DBuild
+FNF:2
+FNH:2
+DA:16,32
+DA:18,32
+DA:19,32
+DA:22,32
+DA:23,32
+DA:24,32
+DA:25,32
+DA:27,32
+DA:29,32
+DA:35,96
+DA:36,96
+LF:11
+LH:11
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass3dbuild.c
+FNL:0,16,30
+FNA:0,32,CeedQFunctionInit_Mass3DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Mass3DBuild
+FNF:2
+FNH:2
+DA:16,32
+DA:18,32
+DA:19,32
+DA:22,32
+DA:23,32
+DA:24,32
+DA:25,32
+DA:27,32
+DA:29,32
+DA:35,96
+DA:36,96
+LF:11
+LH:11
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-massapply.c
+FNL:0,16,29
+FNA:0,96,CeedQFunctionInit_MassApply
+FNL:1,34,36
+FNA:1,96,CeedQFunctionRegister_MassApply
+FNF:2
+FNH:2
+DA:16,96
+DA:18,96
+DA:19,96
+DA:22,96
+DA:23,96
+DA:24,96
+DA:26,96
+DA:28,96
+DA:34,96
+DA:35,96
+LF:10
+LH:10
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3Poisson1DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3Poisson1DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3Poisson2DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3Poisson2DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3Poisson3DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3Poisson3DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson1dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson1DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson1DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson1dbuild.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson1DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson1DBuild
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson2dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson2DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson2DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson2dbuild.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson2DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson2DBuild
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson3dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson3DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson3DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson3dbuild.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson3DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson3DBuild
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/scale/ceed-scale.c
+FNL:0,16,24
+FNA:0,0,CeedQFunctionInit_Scale
+FNL:1,29,29
+FNA:1,96,CeedQFunctionRegister_Scale
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:23,0
+DA:29,96
+LF:5
+LH:1
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/ceed.h
+FNL:0,525,533
+FNA:0,14400,CeedIntPow
+FNL:1,545,545
+FNA:1,420064,CeedIntMin
+FNL:2,557,557
+FNA:2,0,CeedIntMax
+FNF:3
+FNH:2
+DA:525,14400
+DA:526,14400
+DA:527,20992
+DA:528,6592
+DA:529,6592
+DA:530,6592
+DA:532,14400
+DA:545,420064
+DA:557,0
+LF:9
+LH:8
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-identity.h
+FNL:0,17,31
+FNA:0,0,Identity
+FNF:1
+FNH:0
+DA:17,0
+DA:19,0
+DA:20,0
+DA:23,0
+DA:25,0
+DA:28,0
+DA:30,0
+LF:7
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+FNL:0,13,24
+FNA:0,576,Mass1DBuild
+FNF:1
+FNH:1
+DA:13,576
+DA:16,576
+DA:18,576
+DA:21,6720
+DA:23,576
+LF:5
+LH:5
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+FNL:0,13,26
+FNA:0,144,Mass2DBuild
+FNF:1
+FNH:1
+DA:13,144
+DA:16,144
+DA:18,144
+DA:21,9360
+DA:22,9216
+DA:25,144
+LF:6
+LH:6
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+FNL:0,13,28
+FNA:0,48,Mass3DBuild
+FNF:1
+FNH:1
+DA:13,48
+DA:16,48
+DA:18,48
+DA:21,34608
+DA:22,34560
+DA:23,34560
+DA:24,34560
+DA:27,48
+LF:8
+LH:8
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-massapply.h
+FNL:0,13,24
+FNA:0,768,MassApply
+FNF:1
+FNH:1
+DA:13,768
+DA:16,768
+DA:18,768
+DA:21,50688
+DA:23,768
+LF:5
+LH:5
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+FNL:0,13,25
+FNA:0,0,Poisson1DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:19,0
+DA:22,0
+DA:24,0
+LF:5
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+FNL:0,13,28
+FNA:0,0,Poisson1DBuild
+FNF:1
+FNH:0
+DA:13,0
+DA:19,0
+DA:22,0
+DA:25,0
+DA:27,0
+LF:5
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+FNL:0,13,39
+FNA:0,0,Poisson2DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:35,0
+DA:38,0
+LF:10
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+FNL:0,13,38
+FNA:0,0,Poisson2DBuild
+FNF:1
+FNH:0
+DA:13,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:27,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:33,0
+DA:34,0
+DA:37,0
+LF:13
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+FNL:0,13,41
+FNA:0,0,Poisson3DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:37,0
+DA:40,0
+LF:11
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+FNL:0,13,51
+FNA:0,0,Poisson3DBuild
+FNF:1
+FNH:0
+DA:13,0
+DA:17,0
+DA:19,0
+DA:21,0
+DA:24,0
+DA:27,0
+DA:28,0
+DA:31,0
+DA:32,0
+DA:35,0
+DA:42,0
+DA:43,0
+DA:44,0
+DA:45,0
+DA:46,0
+DA:47,0
+DA:50,0
+LF:17
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-scale.h
+FNL:0,13,27
+FNA:0,0,Scale
+FNF:1
+FNH:0
+DA:13,0
+DA:15,0
+DA:19,0
+DA:20,0
+DA:22,0
+DA:25,0
+DA:26,0
+LF:7
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+FNL:0,13,30
+FNA:0,0,Vector3MassApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:29,0
+LF:8
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+FNL:0,13,30
+FNA:0,0,Vector3Poisson1DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:29,0
+LF:8
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+FNL:0,13,40
+FNA:0,0,Vector3Poisson2DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:35,0
+DA:36,0
+DA:39,0
+LF:11
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+FNL:0,13,43
+FNA:0,0,Vector3Poisson3DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:37,0
+DA:38,0
+DA:39,0
+DA:42,0
+LF:13
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-basis.c
+FNL:0,1078,1081
+FNA:0,6528,CeedBasisGetFESpace
+FNL:1,1093,1096
+FNA:1,0,CeedBasisGetTopologyDimension
+FNL:2,1108,1111
+FNA:2,4800,CeedBasisGetTensorContract
+FNL:3,1123,1127
+FNA:3,384,CeedBasisSetTensorContract
+FNL:4,1146,1156
+FNA:4,384,CeedMatrixMatrixMultiply
+FNL:5,1171,1207
+FNA:5,384,CeedQRFactorization
+FNL:6,1228,1241
+FNA:6,384,CeedHouseholderApplyQ
+FNL:7,1256,1285
+FNA:7,384,CeedMatrixPseudoinverse
+FNL:8,129,146
+FNA:8,0,CeedGivensRotation
+FNL:9,1300,1425
+FNA:9,0,CeedSymmetricSchurDecomposition
+FNL:10,1447,1507
+FNA:10,0,CeedSimultaneousDiagonalization
+FNL:11,1536,1576
+FNA:11,864,CeedBasisCreateTensorH1
+FNL:12,1594,1653
+FNA:12,384,CeedBasisCreateTensorH1Lagrange
+FNL:13,163,178
+FNA:13,0,CeedScalarView
+FNL:14,1673,1712
+FNA:14,0,CeedBasisCreateH1
+FNL:15,1732,1771
+FNA:15,0,CeedBasisCreateHdiv
+FNL:16,1791,1831
+FNA:16,0,CeedBasisCreateHcurl
+FNL:17,1856,1899
+FNA:17,0,CeedBasisCreateProjection
+FNL:18,190,193
+FNA:18,0,CeedBasisView_Object
+FNL:19,1914,1919
+FNA:19,6528,CeedBasisReferenceCopy
+FNL:20,1931,1934
+FNA:20,0,CeedBasisSetNumViewTabs
+FNL:21,1946,1949
+FNA:21,0,CeedBasisGetNumViewTabs
+FNL:22,1961,2037
+FNA:22,0,CeedBasisView
+FNL:23,204,207
+FNA:23,0,CeedBasisDestroy_Object
+FNL:24,2060,2065
+FNA:24,4800,CeedBasisApply
+FNL:25,2088,2094
+FNA:25,0,CeedBasisApplyAdd
+FNL:26,2116,2125
+FNA:26,0,CeedBasisApplyAtPoints
+FNL:27,2147,2157
+FNA:27,0,CeedBasisApplyAddAtPoints
+FNL:28,2169,2172
+FNA:28,768,CeedBasisGetCeed
+FNL:29,2183,2183
+FNA:29,0,CeedBasisReturnCeed
+FNL:30,2195,2198
+FNA:30,20736,CeedBasisGetDimension
+FNL:31,2210,2213
+FNA:31,0,CeedBasisGetTopology
+FNL:32,2225,2228
+FNA:32,13248,CeedBasisGetNumComponents
+FNL:33,2240,2243
+FNA:33,10176,CeedBasisGetNumNodes
+FNL:34,2255,2259
+FNA:34,5184,CeedBasisGetNumNodes1D
+FNL:35,226,322
+FNA:35,0,CeedBasisCreateProjectionMatrices
+FNL:36,2271,2274
+FNA:36,10368,CeedBasisGetNumQuadraturePoints
+FNL:37,2286,2290
+FNA:37,5184,CeedBasisGetNumQuadraturePoints1D
+FNL:38,2302,2305
+FNA:38,0,CeedBasisGetQRef
+FNL:39,2317,2320
+FNA:39,192,CeedBasisGetQWeights
+FNL:40,2332,2354
+FNA:40,0,CeedBasisGetInterp
+FNL:41,2366,2373
+FNA:41,4992,CeedBasisGetInterp1D
+FNL:42,2385,2410
+FNA:42,0,CeedBasisGetGrad
+FNL:43,2422,2429
+FNA:43,384,CeedBasisGetGrad1D
+FNL:44,2441,2444
+FNA:44,0,CeedBasisGetDiv
+FNL:45,2456,2459
+FNA:45,0,CeedBasisGetCurl
+FNL:46,2470,2490
+FNA:46,13440,CeedBasisDestroy
+FNL:47,2503,2542
+FNA:47,384,CeedGaussQuadrature
+FNL:48,2555,2608
+FNA:48,576,CeedLobattoQuadrature
+FNL:49,345,376
+FNA:49,4800,CeedBasisApplyCheckDims
+FNL:50,398,453
+FNA:50,0,CeedBasisApplyAtPointsCheckDims
+FNL:51,476,677
+FNA:51,0,CeedBasisApplyAtPoints_Core
+FNL:52,48,53
+FNA:52,0,CeedChebyshevPolynomialsAtPoint
+FNL:53,66,80
+FNA:53,0,CeedChebyshevDerivativeAtPoint
+FNL:54,708,721
+FNA:54,0,CeedBasisCreateH1Fallback
+FNL:55,733,754
+FNA:55,384,CeedBasisGetCollocatedGrad
+FNL:56,766,796
+FNA:56,0,CeedBasisGetChebyshevInterp1D
+FNL:57,808,811
+FNA:57,10176,CeedBasisIsTensor
+FNL:58,823,837
+FNA:58,384,CeedBasisIsCollocated
+FNL:59,849,852
+FNA:59,5184,CeedBasisGetData
+FNL:60,864,867
+FNA:60,384,CeedBasisSetData
+FNL:61,878,881
+FNA:61,6144,CeedBasisReference
+FNL:62,897,923
+FNA:62,10368,CeedBasisGetNumQuadratureComponents
+FNL:63,937,1066
+FNA:63,0,CeedBasisGetFlopsEstimate
+FNL:64,99,108
+FNA:64,3840,CeedHouseholderReflect
+FNF:65
+FNH:33
+DA:48,0
+DA:49,0
+DA:50,0
+DA:51,0
+DA:52,0
+DA:66,0
+DA:69,0
+DA:70,0
+DA:71,0
+DA:72,0
+DA:73,0
+DA:74,0
+DA:75,0
+DA:76,0
+DA:77,0
+DA:79,0
+DA:99,3840
+DA:100,19200
+DA:101,15360
+DA:103,65280
+DA:104,15360
+DA:105,65280
+DA:107,3840
+DA:129,0
+DA:130,0
+DA:132,0
+DA:133,0
+DA:134,0
+DA:135,0
+DA:139,0
+DA:140,0
+DA:142,0
+DA:143,0
+DA:145,0
+DA:163,0
+DA:164,0
+DA:165,0
+DA:169,0
+DA:170,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:175,0
+DA:177,0
+DA:190,0
+DA:191,0
+DA:192,0
+DA:204,0
+DA:205,0
+DA:206,0
+DA:226,0
+DA:231,0
+DA:232,0
+DA:233,0
+DA:237,0
+DA:243,0
+DA:244,0
+DA:245,0
+DA:247,0
+DA:248,0
+DA:249,0
+DA:250,0
+DA:252,0
+DA:253,0
+DA:259,0
+DA:260,0
+DA:261,0
+DA:267,0
+DA:269,0
+DA:271,0
+DA:272,0
+DA:273,0
+DA:274,0
+DA:276,0
+DA:277,0
+DA:278,0
+DA:280,0
+DA:281,0
+DA:286,0
+DA:287,0
+DA:288,0
+DA:290,0
+DA:293,0
+DA:296,0
+DA:297,0
+DA:299,0
+DA:300,0
+DA:302,0
+DA:303,0
+DA:304,0
+DA:305,0
+DA:306,0
+DA:308,0
+DA:310,0
+DA:311,0
+DA:313,0
+DA:314,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:345,4800
+DA:347,4800
+DA:349,4800
+DA:350,4800
+DA:351,4800
+DA:352,4800
+DA:353,4800
+DA:354,4800
+DA:355,4800
+DA:358,4800
+DA:359,4800
+DA:360,4608
+DA:365,1536
+DA:366,9216
+DA:367,3072
+DA:368,3072
+DA:369,4608
+DA:370,192
+DA:371,192
+DA:372,192
+DA:374,4800
+DA:375,4800
+DA:398,0
+DA:400,0
+DA:401,0
+DA:403,0
+DA:404,0
+DA:405,0
+DA:406,0
+DA:407,0
+DA:408,0
+DA:409,0
+DA:410,0
+DA:411,0
+DA:414,0
+DA:415,0
+DA:422,0
+DA:426,0
+DA:427,0
+DA:428,0
+DA:429,0
+DA:430,0
+DA:431,0
+DA:432,0
+DA:433,0
+DA:434,0
+DA:435,0
+DA:436,0
+DA:437,0
+DA:438,0
+DA:439,0
+DA:440,0
+DA:441,0
+DA:442,0
+DA:451,0
+DA:452,0
+DA:476,0
+DA:478,0
+DA:480,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:485,0
+DA:491,0
+DA:492,0
+DA:495,0
+DA:497,0
+DA:498,0
+DA:499,0
+DA:501,0
+DA:507,0
+DA:508,0
+DA:509,0
+DA:510,0
+DA:511,0
+DA:513,0
+DA:514,0
+DA:515,0
+DA:519,0
+DA:520,0
+DA:521,0
+DA:522,0
+DA:526,0
+DA:528,0
+DA:530,0
+DA:532,0
+DA:534,0
+DA:536,0
+DA:537,0
+DA:538,0
+DA:542,0
+DA:543,0
+DA:549,0
+DA:552,0
+DA:553,0
+DA:554,0
+DA:555,0
+DA:556,0
+DA:557,0
+DA:560,0
+DA:561,0
+DA:563,0
+DA:565,0
+DA:566,0
+DA:568,0
+DA:569,0
+DA:571,0
+DA:573,0
+DA:575,0
+DA:576,0
+DA:579,0
+DA:581,0
+DA:582,0
+DA:584,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:590,0
+DA:591,0
+DA:593,0
+DA:596,0
+DA:598,0
+DA:600,0
+DA:602,0
+DA:603,0
+DA:604,0
+DA:605,0
+DA:607,0
+DA:614,0
+DA:615,0
+DA:616,0
+DA:618,0
+DA:619,0
+DA:620,0
+DA:623,0
+DA:624,0
+DA:626,0
+DA:627,0
+DA:629,0
+DA:630,0
+DA:632,0
+DA:633,0
+DA:636,0
+DA:638,0
+DA:639,0
+DA:642,0
+DA:644,0
+DA:645,0
+DA:647,0
+DA:648,0
+DA:650,0
+DA:651,0
+DA:652,0
+DA:655,0
+DA:656,0
+DA:660,0
+DA:662,0
+DA:664,0
+DA:666,0
+DA:667,0
+DA:668,0
+DA:671,0
+DA:672,0
+DA:673,0
+DA:676,0
+DA:708,0
+DA:710,0
+DA:713,0
+DA:714,0
+DA:716,0
+DA:717,0
+DA:718,0
+DA:719,0
+DA:720,0
+DA:733,384
+DA:740,384
+DA:741,384
+DA:742,384
+DA:745,384
+DA:746,384
+DA:747,384
+DA:748,384
+DA:749,384
+DA:751,384
+DA:752,384
+DA:753,384
+DA:766,0
+DA:772,0
+DA:773,0
+DA:774,0
+DA:778,0
+DA:779,0
+DA:780,0
+DA:781,0
+DA:784,0
+DA:785,0
+DA:788,0
+DA:789,0
+DA:792,0
+DA:793,0
+DA:794,0
+DA:795,0
+DA:808,10176
+DA:809,10176
+DA:810,10176
+DA:823,384
+DA:824,384
+DA:825,0
+DA:827,0
+DA:828,0
+DA:829,0
+DA:830,0
+DA:834,384
+DA:836,384
+DA:849,5184
+DA:850,5184
+DA:851,5184
+DA:864,384
+DA:865,384
+DA:866,384
+DA:878,6144
+DA:879,6144
+DA:880,6144
+DA:897,10368
+DA:900,10368
+DA:901,10368
+DA:902,6528
+DA:905,6528
+DA:906,6528
+DA:907,6528
+DA:908,3264
+DA:909,3264
+DA:910,3264
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:917,576
+DA:919,576
+DA:920,576
+DA:922,10368
+DA:937,0
+DA:941,0
+DA:942,0
+DA:943,0
+DA:946,0
+DA:947,0
+DA:948,0
+DA:949,0
+DA:950,0
+DA:951,0
+DA:952,0
+DA:954,0
+DA:956,0
+DA:957,0
+DA:958,0
+DA:959,0
+DA:961,0
+DA:962,0
+DA:967,0
+DA:968,0
+DA:971,0
+DA:972,0
+DA:974,0
+DA:975,0
+DA:976,0
+DA:977,0
+DA:980,0
+DA:981,0
+DA:982,0
+DA:983,0
+DA:984,0
+DA:985,0
+DA:986,0
+DA:987,0
+DA:988,0
+DA:990,0
+DA:992,0
+DA:994,0
+DA:995,0
+DA:996,0
+DA:997,0
+DA:998,0
+DA:1000,0
+DA:1002,0
+DA:1004,0
+DA:1006,0
+DA:1014,0
+DA:1015,0
+DA:1016,0
+DA:1019,0
+DA:1020,0
+DA:1021,0
+DA:1022,0
+DA:1023,0
+DA:1024,0
+DA:1025,0
+DA:1026,0
+DA:1027,0
+DA:1028,0
+DA:1029,0
+DA:1037,0
+DA:1038,0
+DA:1039,0
+DA:1045,0
+DA:1046,0
+DA:1047,0
+DA:1048,0
+DA:1049,0
+DA:1050,0
+DA:1051,0
+DA:1052,0
+DA:1053,0
+DA:1054,0
+DA:1058,0
+DA:1059,0
+DA:1060,0
+DA:1061,0
+DA:1062,0
+DA:1065,0
+DA:1078,6528
+DA:1079,6528
+DA:1080,6528
+DA:1093,0
+DA:1094,0
+DA:1095,0
+DA:1108,4800
+DA:1109,4800
+DA:1110,4800
+DA:1123,384
+DA:1124,384
+DA:1125,384
+DA:1126,384
+DA:1146,384
+DA:1147,2688
+DA:1148,16128
+DA:1149,13824
+DA:1151,82944
+DA:1152,13824
+DA:1155,384
+DA:1171,384
+DA:1172,384
+DA:1175,384
+DA:1177,2304
+DA:1178,1920
+DA:1180,1920
+DA:1181,0
+DA:1182,0
+DA:1185,1920
+DA:1186,7680
+DA:1187,5760
+DA:1188,5760
+DA:1190,1920
+DA:1191,1920
+DA:1193,1920
+DA:1197,1920
+DA:1198,7680
+DA:1201,1920
+DA:1203,1920
+DA:1204,7680
+DA:1206,384
+DA:1228,384
+DA:1232,384
+DA:1233,2304
+DA:1234,1920
+DA:1235,7680
+DA:1237,1920
+DA:1239,384
+DA:1240,384
+DA:1256,384
+DA:1259,384
+DA:1260,384
+DA:1261,384
+DA:1262,384
+DA:1265,384
+DA:1268,2688
+DA:1269,384
+DA:1271,2688
+DA:1272,2304
+DA:1273,11520
+DA:1274,9216
+DA:1275,32256
+DA:1276,9216
+DA:1281,384
+DA:1282,384
+DA:1283,384
+DA:1284,384
+DA:1300,0
+DA:1302,0
+DA:1304,0
+DA:1307,0
+DA:1308,0
+DA:1309,0
+DA:1313,0
+DA:1315,0
+DA:1317,0
+DA:1318,0
+DA:1319,0
+DA:1320,0
+DA:1322,0
+DA:1323,0
+DA:1325,0
+DA:1329,0
+DA:1330,0
+DA:1333,0
+DA:1334,0
+DA:1335,0
+DA:1338,0
+DA:1339,0
+DA:1342,0
+DA:1343,0
+DA:1344,0
+DA:1345,0
+DA:1349,0
+DA:1350,0
+DA:1351,0
+DA:1352,0
+DA:1353,0
+DA:1354,0
+DA:1356,0
+DA:1361,0
+DA:1362,0
+DA:1364,0
+DA:1366,0
+DA:1367,0
+DA:1368,0
+DA:1369,0
+DA:1370,0
+DA:1372,0
+DA:1373,0
+DA:1374,0
+DA:1376,0
+DA:1379,0
+DA:1380,0
+DA:1381,0
+DA:1382,0
+DA:1383,0
+DA:1385,0
+DA:1387,0
+DA:1389,0
+DA:1390,0
+DA:1391,0
+DA:1393,0
+DA:1394,0
+DA:1396,0
+DA:1398,0
+DA:1399,0
+DA:1404,0
+DA:1405,0
+DA:1408,0
+DA:1411,0
+DA:1412,0
+DA:1413,0
+DA:1416,0
+DA:1420,0
+DA:1423,0
+DA:1424,0
+DA:1447,0
+DA:1450,0
+DA:1451,0
+DA:1452,0
+DA:1455,0
+DA:1456,0
+DA:1459,0
+DA:1460,0
+DA:1461,0
+DA:1462,0
+DA:1463,0
+DA:1471,0
+DA:1474,0
+DA:1475,0
+DA:1476,0
+DA:1477,0
+DA:1481,0
+DA:1483,0
+DA:1486,0
+DA:1489,0
+DA:1490,0
+DA:1491,0
+DA:1492,0
+DA:1493,0
+DA:1500,0
+DA:1503,0
+DA:1504,0
+DA:1505,0
+DA:1506,0
+DA:1536,864
+DA:1538,864
+DA:1541,480
+DA:1542,480
+DA:1543,480
+DA:1544,480
+DA:1545,480
+DA:1548,384
+DA:1549,384
+DA:1550,384
+DA:1551,384
+DA:1553,384
+DA:1555,384
+DA:1556,384
+DA:1557,384
+DA:1558,384
+DA:1559,384
+DA:1560,384
+DA:1561,384
+DA:1562,384
+DA:1563,384
+DA:1564,384
+DA:1565,384
+DA:1566,384
+DA:1567,384
+DA:1568,384
+DA:1569,384
+DA:1570,384
+DA:1571,384
+DA:1572,384
+DA:1573,384
+DA:1574,384
+DA:1575,384
+DA:1594,384
+DA:1596,384
+DA:1599,384
+DA:1600,384
+DA:1601,384
+DA:1602,384
+DA:1605,384
+DA:1606,384
+DA:1607,384
+DA:1608,384
+DA:1609,384
+DA:1610,384
+DA:1611,384
+DA:1612,384
+DA:1613,384
+DA:1614,384
+DA:1615,0
+DA:1616,0
+DA:1617,0
+DA:1619,384
+DA:1623,2688
+DA:1624,2304
+DA:1625,2304
+DA:1626,2304
+DA:1627,11520
+DA:1628,9216
+DA:1629,9216
+DA:1630,9216
+DA:1631,32256
+DA:1632,23040
+DA:1633,23040
+DA:1634,23040
+DA:1635,9216
+DA:1636,9216
+DA:1638,23040
+DA:1639,23040
+DA:1641,9216
+DA:1645,384
+DA:1646,384
+DA:1647,384
+DA:1648,384
+DA:1649,384
+DA:1650,384
+DA:1651,384
+DA:1652,384
+DA:1673,0
+DA:1675,0
+DA:1677,0
+DA:1680,0
+DA:1681,0
+DA:1682,0
+DA:1683,0
+DA:1684,0
+DA:1687,0
+DA:1688,0
+DA:1689,0
+DA:1691,0
+DA:1693,0
+DA:1694,0
+DA:1695,0
+DA:1696,0
+DA:1697,0
+DA:1698,0
+DA:1699,0
+DA:1700,0
+DA:1701,0
+DA:1702,0
+DA:1703,0
+DA:1704,0
+DA:1705,0
+DA:1706,0
+DA:1707,0
+DA:1708,0
+DA:1709,0
+DA:1710,0
+DA:1711,0
+DA:1732,0
+DA:1734,0
+DA:1736,0
+DA:1739,0
+DA:1740,0
+DA:1741,0
+DA:1742,0
+DA:1743,0
+DA:1746,0
+DA:1747,0
+DA:1748,0
+DA:1750,0
+DA:1752,0
+DA:1753,0
+DA:1754,0
+DA:1755,0
+DA:1756,0
+DA:1757,0
+DA:1758,0
+DA:1759,0
+DA:1760,0
+DA:1761,0
+DA:1762,0
+DA:1763,0
+DA:1764,0
+DA:1765,0
+DA:1766,0
+DA:1767,0
+DA:1768,0
+DA:1769,0
+DA:1770,0
+DA:1791,0
+DA:1793,0
+DA:1795,0
+DA:1798,0
+DA:1799,0
+DA:1800,0
+DA:1801,0
+DA:1802,0
+DA:1805,0
+DA:1806,0
+DA:1807,0
+DA:1809,0
+DA:1810,0
+DA:1812,0
+DA:1813,0
+DA:1814,0
+DA:1815,0
+DA:1816,0
+DA:1817,0
+DA:1818,0
+DA:1819,0
+DA:1820,0
+DA:1821,0
+DA:1822,0
+DA:1823,0
+DA:1824,0
+DA:1825,0
+DA:1826,0
+DA:1827,0
+DA:1828,0
+DA:1829,0
+DA:1830,0
+DA:1856,0
+DA:1862,0
+DA:1865,0
+DA:1871,0
+DA:1872,0
+DA:1873,0
+DA:1875,0
+DA:1876,0
+DA:1877,0
+DA:1880,0
+DA:1881,0
+DA:1882,0
+DA:1888,0
+DA:1889,0
+DA:1890,0
+DA:1891,0
+DA:1895,0
+DA:1896,0
+DA:1897,0
+DA:1898,0
+DA:1914,6528
+DA:1915,6528
+DA:1916,6528
+DA:1917,6528
+DA:1918,6528
+DA:1931,0
+DA:1932,0
+DA:1933,0
+DA:1946,0
+DA:1947,0
+DA:1948,0
+DA:1961,0
+DA:1963,0
+DA:1968,0
+DA:1969,0
+DA:1970,0
+DA:1973,0
+DA:1975,0
+DA:1976,0
+DA:1977,0
+DA:1981,0
+DA:1982,0
+DA:1983,0
+DA:1985,0
+DA:1987,0
+DA:1989,0
+DA:1993,0
+DA:1994,0
+DA:1995,0
+DA:1996,0
+DA:1997,0
+DA:1998,0
+DA:2000,0
+DA:2001,0
+DA:2002,0
+DA:2003,0
+DA:2008,0
+DA:2009,0
+DA:2010,0
+DA:2011,0
+DA:2012,0
+DA:2013,0
+DA:2014,0
+DA:2015,0
+DA:2016,0
+DA:2018,0
+DA:2019,0
+DA:2020,0
+DA:2021,0
+DA:2022,0
+DA:2023,0
+DA:2024,0
+DA:2026,0
+DA:2027,0
+DA:2028,0
+DA:2030,0
+DA:2031,0
+DA:2032,0
+DA:2035,0
+DA:2036,0
+DA:2060,4800
+DA:2061,4800
+DA:2062,4800
+DA:2063,4800
+DA:2064,4800
+DA:2088,0
+DA:2089,0
+DA:2090,0
+DA:2091,0
+DA:2092,0
+DA:2093,0
+DA:2116,0
+DA:2118,0
+DA:2119,0
+DA:2120,0
+DA:2122,0
+DA:2124,0
+DA:2147,0
+DA:2149,0
+DA:2150,0
+DA:2151,0
+DA:2152,0
+DA:2154,0
+DA:2156,0
+DA:2169,768
+DA:2170,768
+DA:2171,768
+DA:2183,0
+DA:2195,20736
+DA:2196,20736
+DA:2197,20736
+DA:2210,0
+DA:2211,0
+DA:2212,0
+DA:2225,13248
+DA:2226,13248
+DA:2227,13248
+DA:2240,10176
+DA:2241,10176
+DA:2242,10176
+DA:2255,5184
+DA:2256,5184
+DA:2257,5184
+DA:2258,5184
+DA:2271,10368
+DA:2272,10368
+DA:2273,10368
+DA:2286,5184
+DA:2287,5184
+DA:2288,5184
+DA:2289,5184
+DA:2302,0
+DA:2303,0
+DA:2304,0
+DA:2317,192
+DA:2318,192
+DA:2319,192
+DA:2332,0
+DA:2333,0
+DA:2335,0
+DA:2338,0
+DA:2341,0
+DA:2342,0
+DA:2343,0
+DA:2344,0
+DA:2345,0
+DA:2347,0
+DA:2352,0
+DA:2353,0
+DA:2366,4992
+DA:2369,4992
+DA:2370,4992
+DA:2371,4992
+DA:2372,4992
+DA:2385,0
+DA:2386,0
+DA:2388,0
+DA:2391,0
+DA:2394,0
+DA:2395,0
+DA:2396,0
+DA:2397,0
+DA:2398,0
+DA:2399,0
+DA:2401,0
+DA:2402,0
+DA:2408,0
+DA:2409,0
+DA:2422,384
+DA:2425,384
+DA:2426,384
+DA:2427,384
+DA:2428,384
+DA:2441,0
+DA:2442,0
+DA:2443,0
+DA:2456,0
+DA:2457,0
+DA:2458,0
+DA:2470,13440
+DA:2471,13440
+DA:2472,13056
+DA:2473,13056
+DA:2475,384
+DA:2476,384
+DA:2477,384
+DA:2478,384
+DA:2479,384
+DA:2480,384
+DA:2481,384
+DA:2482,384
+DA:2483,384
+DA:2484,384
+DA:2485,384
+DA:2486,384
+DA:2487,384
+DA:2488,384
+DA:2489,384
+DA:2503,384
+DA:2504,384
+DA:2507,1920
+DA:2509,1536
+DA:2511,1536
+DA:2512,1536
+DA:2513,1536
+DA:2514,9216
+DA:2515,7680
+DA:2516,7680
+DA:2517,7680
+DA:2520,1536
+DA:2521,1536
+DA:2523,7296
+DA:2524,5760
+DA:2525,5760
+DA:2526,34560
+DA:2527,28800
+DA:2528,28800
+DA:2529,28800
+DA:2531,5760
+DA:2532,5760
+DA:2535,1536
+DA:2536,1536
+DA:2537,1536
+DA:2538,1536
+DA:2539,1536
+DA:2541,384
+DA:2555,576
+DA:2556,576
+DA:2560,576
+DA:2561,576
+DA:2562,576
+DA:2563,96
+DA:2564,96
+DA:2566,576
+DA:2567,576
+DA:2569,1728
+DA:2571,1152
+DA:2573,1152
+DA:2574,1152
+DA:2575,1152
+DA:2576,4608
+DA:2577,3456
+DA:2578,3456
+DA:2579,3456
+DA:2582,1152
+DA:2583,1152
+DA:2584,1152
+DA:2586,14976
+DA:2587,13824
+DA:2588,13824
+DA:2589,55296
+DA:2590,41472
+DA:2591,41472
+DA:2592,41472
+DA:2594,13824
+DA:2595,13824
+DA:2596,13824
+DA:2599,1152
+DA:2600,1152
+DA:2601,192
+DA:2602,192
+DA:2604,1152
+DA:2605,1152
+DA:2607,576
+LF:1024
+LH:339
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-config.c
+FNL:0,33,36
+FNA:0,0,CeedGetGitVersion
+FNL:1,50,53
+FNA:1,0,CeedSetIsClang
+FNL:2,67,70
+FNA:2,0,CeedGetIsClang
+FNL:3,85,88
+FNA:3,0,CeedGetBuildConfiguration
+FNF:4
+FNH:0
+DA:33,0
+DA:34,0
+DA:35,0
+DA:50,0
+DA:51,0
+DA:52,0
+DA:67,0
+DA:68,0
+DA:69,0
+DA:85,0
+DA:86,0
+DA:87,0
+LF:12
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-elemrestriction.c
+FNL:0,1018,1060
+FNA:0,0,CeedElemRestrictionCreateBlockedOriented
+FNL:1,1090,1133
+FNA:1,0,CeedElemRestrictionCreateBlockedCurlOriented
+FNL:2,111,114
+FNA:2,0,CeedElemRestrictionView_Object
+FNL:3,1154,1190
+FNA:3,288,CeedElemRestrictionCreateBlockedStrided
+FNL:4,1204,1221
+FNA:4,0,CeedElemRestrictionCreateUnsignedCopy
+FNL:5,1235,1252
+FNA:5,0,CeedElemRestrictionCreateUnorientedCopy
+FNL:6,125,128
+FNA:6,0,CeedElemRestrictionDestroy_Object
+FNL:7,1269,1274
+FNA:7,11568,CeedElemRestrictionReferenceCopy
+FNL:8,1287,1298
+FNA:8,960,CeedElemRestrictionCreateVector
+FNL:9,1314,1336
+FNA:9,576,CeedElemRestrictionApply
+FNL:10,1353,1391
+FNA:10,0,CeedElemRestrictionApplyAtPointsInElement
+FNL:11,1408,1446
+FNA:11,3072,CeedElemRestrictionApplyBlock
+FNL:12,1458,1461
+FNA:12,2976,CeedElemRestrictionGetCeed
+FNL:13,1472,1472
+FNA:13,0,CeedElemRestrictionReturnCeed
+FNL:14,148,151
+FNA:14,7296,CeedElemRestrictionGetType
+FNL:15,1484,1487
+FNA:15,5664,CeedElemRestrictionGetCompStride
+FNL:16,1499,1502
+FNA:16,10656,CeedElemRestrictionGetNumElements
+FNL:17,1514,1517
+FNA:17,16992,CeedElemRestrictionGetElementSize
+FNL:18,1530,1539
+FNA:18,0,CeedElemRestrictionGetNumPoints
+FNL:19,1553,1565
+FNA:19,0,CeedElemRestrictionGetNumPointsInElement
+FNL:20,1578,1605
+FNA:20,0,CeedElemRestrictionGetMinMaxPointsInElement
+FNL:21,1619,1621
+FNA:21,0,CeedElemRestrictionGetMaxPointsInElement
+FNL:22,163,166
+FNA:22,0,CeedElemRestrictionIsStrided
+FNL:23,1635,1637
+FNA:23,0,CeedElemRestrictionGetMinPointsInElement
+FNL:24,1649,1652
+FNA:24,6432,CeedElemRestrictionGetLVectorSize
+FNL:25,1664,1667
+FNA:25,1536,CeedElemRestrictionGetEVectorSize
+FNL:26,1679,1682
+FNA:26,9888,CeedElemRestrictionGetNumComponents
+FNL:27,1694,1697
+FNA:27,1872,CeedElemRestrictionGetNumBlocks
+FNL:28,1709,1712
+FNA:28,8016,CeedElemRestrictionGetBlockSize
+FNL:29,1724,1739
+FNA:29,0,CeedElemRestrictionGetMultiplicity
+FNL:30,1751,1754
+FNA:30,0,CeedElemRestrictionSetNumViewTabs
+FNL:31,1766,1769
+FNA:31,0,CeedElemRestrictionGetNumViewTabs
+FNL:32,178,181
+FNA:32,0,CeedElemRestrictionIsAtPoints
+FNL:33,1781,1818
+FNA:33,0,CeedElemRestrictionView
+FNL:34,1829,1845
+FNA:34,25152,CeedElemRestrictionDestroy
+FNL:35,194,222
+FNA:35,0,CeedElemRestrictionAtPointsAreCompatible
+FNL:36,234,238
+FNA:36,288,CeedElemRestrictionGetStrides
+FNL:37,250,255
+FNA:37,1536,CeedElemRestrictionHasBackendStrides
+FNL:38,269,279
+FNA:38,432,CeedElemRestrictionGetOffsets
+FNL:39,291,299
+FNA:39,432,CeedElemRestrictionRestoreOffsets
+FNL:40,313,319
+FNA:40,0,CeedElemRestrictionGetOrientations
+FNL:41,331,335
+FNA:41,0,CeedElemRestrictionRestoreOrientations
+FNL:42,349,355
+FNA:42,0,CeedElemRestrictionGetCurlOrientations
+FNL:43,367,371
+FNA:43,0,CeedElemRestrictionRestoreCurlOrientations
+FNL:44,38,48
+FNA:44,432,CeedPermutePadOffsets
+FNL:45,385,400
+FNA:45,0,CeedElemRestrictionGetLLayout
+FNL:46,414,422
+FNA:46,480,CeedElemRestrictionSetLLayout
+FNL:47,436,440
+FNA:47,0,CeedElemRestrictionGetELayout
+FNL:48,454,457
+FNA:48,1296,CeedElemRestrictionSetELayout
+FNL:49,472,496
+FNA:49,0,CeedElemRestrictionGetAtPointsElementOffset
+FNL:50,509,521
+FNA:50,0,CeedElemRestrictionSetAtPointsEVectorSize
+FNL:51,533,536
+FNA:51,7680,CeedElemRestrictionGetData
+FNL:52,548,551
+FNA:52,1296,CeedElemRestrictionSetData
+FNL:53,562,565
+FNA:53,9456,CeedElemRestrictionReference
+FNL:54,576,615
+FNA:54,0,CeedElemRestrictionGetFlopsEstimate
+FNL:55,64,73
+FNA:55,0,CeedPermutePadOrients
+FNL:56,657,687
+FNA:56,720,CeedElemRestrictionCreate
+FNL:57,712,744
+FNA:57,0,CeedElemRestrictionCreateOriented
+FNL:58,770,802
+FNA:58,0,CeedElemRestrictionCreateCurlOriented
+FNL:59,824,857
+FNA:59,360,CeedElemRestrictionCreateStrided
+FNL:60,89,99
+FNA:60,0,CeedPermutePadCurlOrients
+FNL:61,891,923
+FNA:61,0,CeedElemRestrictionCreateAtPoints
+FNL:62,950,989
+FNA:62,432,CeedElemRestrictionCreateBlocked
+FNF:63
+FNH:29
+DA:38,432
+DA:40,3024
+DA:41,9216
+DA:42,166464
+DA:43,159840
+DA:47,432
+DA:64,0
+DA:65,0
+DA:66,0
+DA:67,0
+DA:68,0
+DA:72,0
+DA:89,0
+DA:91,0
+DA:92,0
+DA:93,0
+DA:94,0
+DA:98,0
+DA:111,0
+DA:112,0
+DA:113,0
+DA:125,0
+DA:126,0
+DA:127,0
+DA:148,7296
+DA:149,7296
+DA:150,7296
+DA:163,0
+DA:164,0
+DA:165,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:194,0
+DA:198,0
+DA:200,0
+DA:203,0
+DA:204,0
+DA:205,0
+DA:206,0
+DA:209,0
+DA:210,0
+DA:211,0
+DA:212,0
+DA:215,0
+DA:216,0
+DA:217,0
+DA:218,0
+DA:219,0
+DA:221,0
+DA:234,288
+DA:235,288
+DA:236,1152
+DA:237,288
+DA:250,1536
+DA:251,1536
+DA:252,3072
+DA:253,1536
+DA:254,1536
+DA:269,432
+DA:270,432
+DA:271,0
+DA:273,432
+DA:275,432
+DA:276,432
+DA:278,432
+DA:291,432
+DA:292,432
+DA:293,0
+DA:295,432
+DA:296,432
+DA:298,432
+DA:313,0
+DA:314,0
+DA:316,0
+DA:317,0
+DA:318,0
+DA:331,0
+DA:332,0
+DA:333,0
+DA:334,0
+DA:349,0
+DA:350,0
+DA:352,0
+DA:353,0
+DA:354,0
+DA:367,0
+DA:368,0
+DA:369,0
+DA:370,0
+DA:385,0
+DA:389,0
+DA:390,0
+DA:392,0
+DA:393,0
+DA:394,0
+DA:395,0
+DA:397,0
+DA:399,0
+DA:414,480
+DA:417,480
+DA:418,480
+DA:420,1920
+DA:421,480
+DA:436,0
+DA:437,0
+DA:438,0
+DA:439,0
+DA:454,1296
+DA:455,5184
+DA:456,1296
+DA:472,0
+DA:476,0
+DA:477,0
+DA:481,0
+DA:482,0
+DA:483,0
+DA:487,0
+DA:488,0
+DA:489,0
+DA:492,0
+DA:493,0
+DA:495,0
+DA:509,0
+DA:512,0
+DA:513,0
+DA:515,0
+DA:519,0
+DA:520,0
+DA:533,7680
+DA:534,7680
+DA:535,7680
+DA:548,1296
+DA:549,1296
+DA:550,1296
+DA:562,9456
+DA:563,9456
+DA:564,9456
+DA:576,0
+DA:577,0
+DA:580,0
+DA:581,0
+DA:582,0
+DA:583,0
+DA:584,0
+DA:585,0
+DA:586,0
+DA:587,0
+DA:589,0
+DA:590,0
+DA:591,0
+DA:592,0
+DA:593,0
+DA:594,0
+DA:595,0
+DA:596,0
+DA:599,0
+DA:600,0
+DA:603,0
+DA:604,0
+DA:605,0
+DA:606,0
+DA:607,0
+DA:608,0
+DA:609,0
+DA:610,0
+DA:613,0
+DA:614,0
+DA:657,720
+DA:659,720
+DA:662,336
+DA:663,336
+DA:664,336
+DA:665,336
+DA:666,336
+DA:669,384
+DA:670,384
+DA:671,384
+DA:672,384
+DA:674,384
+DA:675,384
+DA:676,384
+DA:677,384
+DA:678,384
+DA:679,384
+DA:680,384
+DA:681,384
+DA:682,384
+DA:683,384
+DA:684,384
+DA:685,384
+DA:686,384
+DA:712,0
+DA:715,0
+DA:718,0
+DA:719,0
+DA:720,0
+DA:722,0
+DA:723,0
+DA:726,0
+DA:727,0
+DA:728,0
+DA:729,0
+DA:731,0
+DA:732,0
+DA:733,0
+DA:734,0
+DA:735,0
+DA:736,0
+DA:737,0
+DA:738,0
+DA:739,0
+DA:740,0
+DA:741,0
+DA:742,0
+DA:743,0
+DA:770,0
+DA:773,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:780,0
+DA:781,0
+DA:784,0
+DA:785,0
+DA:786,0
+DA:787,0
+DA:789,0
+DA:790,0
+DA:791,0
+DA:792,0
+DA:793,0
+DA:794,0
+DA:795,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:799,0
+DA:800,0
+DA:801,0
+DA:824,360
+DA:826,360
+DA:829,168
+DA:830,168
+DA:831,168
+DA:832,168
+DA:833,168
+DA:836,192
+DA:837,192
+DA:838,192
+DA:839,192
+DA:843,192
+DA:844,192
+DA:845,192
+DA:846,192
+DA:847,192
+DA:848,192
+DA:849,192
+DA:850,192
+DA:851,192
+DA:852,192
+DA:853,192
+DA:854,768
+DA:855,192
+DA:856,192
+DA:891,0
+DA:893,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:903,0
+DA:904,0
+DA:905,0
+DA:906,0
+DA:910,0
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:917,0
+DA:918,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:950,432
+DA:953,432
+DA:955,432
+DA:958,0
+DA:959,0
+DA:960,0
+DA:962,0
+DA:963,0
+DA:966,432
+DA:967,432
+DA:968,432
+DA:969,432
+DA:970,432
+DA:972,432
+DA:973,432
+DA:975,432
+DA:976,432
+DA:977,432
+DA:978,432
+DA:979,432
+DA:980,432
+DA:981,432
+DA:982,432
+DA:983,432
+DA:984,432
+DA:985,432
+DA:986,432
+DA:987,432
+DA:988,432
+DA:1018,0
+DA:1022,0
+DA:1024,0
+DA:1027,0
+DA:1028,0
+DA:1029,0
+DA:1031,0
+DA:1032,0
+DA:1035,0
+DA:1036,0
+DA:1037,0
+DA:1038,0
+DA:1040,0
+DA:1041,0
+DA:1042,0
+DA:1043,0
+DA:1045,0
+DA:1046,0
+DA:1047,0
+DA:1048,0
+DA:1049,0
+DA:1050,0
+DA:1051,0
+DA:1052,0
+DA:1053,0
+DA:1054,0
+DA:1055,0
+DA:1056,0
+DA:1058,0
+DA:1059,0
+DA:1090,0
+DA:1094,0
+DA:1096,0
+DA:1099,0
+DA:1100,0
+DA:1101,0
+DA:1103,0
+DA:1104,0
+DA:1107,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1111,0
+DA:1113,0
+DA:1114,0
+DA:1115,0
+DA:1116,0
+DA:1118,0
+DA:1119,0
+DA:1120,0
+DA:1121,0
+DA:1122,0
+DA:1123,0
+DA:1124,0
+DA:1125,0
+DA:1126,0
+DA:1127,0
+DA:1128,0
+DA:1129,0
+DA:1131,0
+DA:1132,0
+DA:1154,288
+DA:1156,288
+DA:1158,288
+DA:1161,0
+DA:1162,0
+DA:1163,0
+DA:1164,0
+DA:1165,0
+DA:1168,288
+DA:1169,288
+DA:1170,288
+DA:1171,288
+DA:1172,288
+DA:1176,288
+DA:1177,288
+DA:1178,288
+DA:1179,288
+DA:1180,288
+DA:1181,288
+DA:1182,288
+DA:1183,288
+DA:1184,288
+DA:1185,288
+DA:1186,288
+DA:1187,1152
+DA:1188,288
+DA:1189,288
+DA:1204,0
+DA:1205,0
+DA:1208,0
+DA:1209,0
+DA:1211,0
+DA:1212,0
+DA:1213,0
+DA:1214,0
+DA:1216,0
+DA:1219,0
+DA:1220,0
+DA:1235,0
+DA:1236,0
+DA:1239,0
+DA:1240,0
+DA:1242,0
+DA:1243,0
+DA:1244,0
+DA:1245,0
+DA:1247,0
+DA:1250,0
+DA:1251,0
+DA:1269,11568
+DA:1270,11568
+DA:1271,11568
+DA:1272,11568
+DA:1273,11568
+DA:1287,960
+DA:1291,960
+DA:1292,960
+DA:1293,960
+DA:1294,960
+DA:1295,960
+DA:1296,960
+DA:1297,960
+DA:1314,576
+DA:1318,576
+DA:1319,384
+DA:1320,384
+DA:1322,192
+DA:1323,192
+DA:1325,576
+DA:1326,576
+DA:1329,576
+DA:1330,576
+DA:1333,576
+DA:1334,576
+DA:1335,576
+DA:1353,0
+DA:1358,0
+DA:1361,0
+DA:1364,0
+DA:1365,0
+DA:1366,0
+DA:1367,0
+DA:1371,0
+DA:1372,0
+DA:1373,0
+DA:1374,0
+DA:1376,0
+DA:1377,0
+DA:1381,0
+DA:1382,0
+DA:1386,0
+DA:1387,0
+DA:1389,0
+DA:1390,0
+DA:1408,3072
+DA:1413,3072
+DA:1416,3072
+DA:1417,3072
+DA:1420,1536
+DA:1421,1536
+DA:1422,1536
+DA:1423,1536
+DA:1427,1536
+DA:1428,1536
+DA:1429,1536
+DA:1430,1536
+DA:1432,3072
+DA:1433,3072
+DA:1436,3072
+DA:1437,3072
+DA:1440,3072
+DA:1441,3072
+DA:1444,3072
+DA:1445,3072
+DA:1458,2976
+DA:1459,2976
+DA:1460,2976
+DA:1472,0
+DA:1484,5664
+DA:1485,5664
+DA:1486,5664
+DA:1499,10656
+DA:1500,10656
+DA:1501,10656
+DA:1514,16992
+DA:1515,16992
+DA:1516,16992
+DA:1530,0
+DA:1533,0
+DA:1534,0
+DA:1537,0
+DA:1538,0
+DA:1553,0
+DA:1557,0
+DA:1558,0
+DA:1561,0
+DA:1562,0
+DA:1563,0
+DA:1564,0
+DA:1578,0
+DA:1582,0
+DA:1583,0
+DA:1586,0
+DA:1589,0
+DA:1590,0
+DA:1591,0
+DA:1592,0
+DA:1596,0
+DA:1597,0
+DA:1598,0
+DA:1599,0
+DA:1600,0
+DA:1601,0
+DA:1602,0
+DA:1604,0
+DA:1619,0
+DA:1620,0
+DA:1635,0
+DA:1636,0
+DA:1649,6432
+DA:1650,6432
+DA:1651,6432
+DA:1664,1536
+DA:1665,1536
+DA:1666,1536
+DA:1679,9888
+DA:1680,9888
+DA:1681,9888
+DA:1694,1872
+DA:1695,1872
+DA:1696,1872
+DA:1709,8016
+DA:1710,8016
+DA:1711,8016
+DA:1724,0
+DA:1728,0
+DA:1731,0
+DA:1732,0
+DA:1734,0
+DA:1735,0
+DA:1737,0
+DA:1738,0
+DA:1751,0
+DA:1752,0
+DA:1753,0
+DA:1766,0
+DA:1767,0
+DA:1768,0
+DA:1781,0
+DA:1782,0
+DA:1786,0
+DA:1788,0
+DA:1789,0
+DA:1790,0
+DA:1793,0
+DA:1794,0
+DA:1797,0
+DA:1798,0
+DA:1805,0
+DA:1806,0
+DA:1808,0
+DA:1810,0
+DA:1813,0
+DA:1814,0
+DA:1816,0
+DA:1817,0
+DA:1829,25152
+DA:1830,25152
+DA:1831,24144
+DA:1832,24144
+DA:1834,1008
+DA:1838,1008
+DA:1839,1008
+DA:1841,1008
+DA:1842,1008
+DA:1843,1008
+DA:1844,1008
+LF:596
+LH:225
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-fortran.c
+FNL:0,1031,1036
+FNA:0,0,ceedoperatorcompositeaddsub_
+FNL:1,1039,1044
+FNA:1,0,ceedoperatorsetname_
+FNL:2,1047,1051
+FNA:2,0,ceedoperatorsetnumviewtabs_
+FNL:3,1054,1098
+FNA:3,0,ceedoperatorlinearassembleqfunction_
+FNL:4,1101,1124
+FNA:4,0,ceedoperatorlinearassemblediagonal_
+FNL:5,1127,1148
+FNA:5,0,ceedoperatormultigridlevelcreate_
+FNL:6,1151,1172
+FNA:6,0,ceedoperatormultigridlevelcreatetensorh1_
+FNL:7,1175,1196
+FNA:7,0,ceedoperatormultigridlevelcreateh1_
+FNL:8,119,132
+FNA:8,384,ceedvectorcreate_
+FNL:9,1199,1203
+FNA:9,0,ceedoperatorview_
+FNL:10,1206,1241
+FNA:10,0,ceedoperatorcreatefdmelementinverse_
+FNL:11,1244,1270
+FNA:11,192,ceedoperatorapply_
+FNL:12,1273,1299
+FNA:12,0,ceedoperatorapplyadd_
+FNL:13,1302,1307
+FNA:13,0,ceedoperatorapplyjacobian_
+FNL:14,1310,1322
+FNA:14,192,ceedoperatordestroy_
+FNL:15,135,137
+FNA:15,288,ceedvectorsetarray_
+FNL:16,140,145
+FNA:16,0,ceedvectortakearray_
+FNL:17,148,148
+FNA:17,0,ceedvectorsyncarray_
+FNL:18,151,151
+FNA:18,96,ceedvectorsetvalue_
+FNL:19,154,159
+FNA:19,0,ceedvectorgetarray_
+FNL:20,162,167
+FNA:20,96,ceedvectorgetarrayread_
+FNL:21,170,175
+FNA:21,0,ceedvectorgetarraywrite_
+FNL:22,178,182
+FNA:22,0,ceedvectorrestorearray_
+FNL:23,185,188
+FNA:23,96,ceedvectorrestorearrayread_
+FNL:24,191,193
+FNA:24,0,ceedvectornorm_
+FNL:25,196,196
+FNA:25,0,ceedvectorreciprocal_
+FNL:26,199,199
+FNA:26,0,ceedvectorsetnumviewtabs_
+FNL:27,202,202
+FNA:27,0,ceedvectorview_
+FNL:28,205,218
+FNA:28,384,ceedvectordestroy_
+FNL:29,229,246
+FNA:29,192,ceedelemrestrictioncreate_
+FNL:30,249,267
+FNA:30,0,ceedelemrestrictioncreateoriented_
+FNL:31,270,289
+FNA:31,0,ceedelemrestrictioncreatecurloriented_
+FNL:32,292,307
+FNA:32,96,ceedelemrestrictioncreatestrided_
+FNL:33,310,327
+FNA:33,0,ceedelemrestrictioncreateblocked_
+FNL:34,330,349
+FNA:34,0,ceedelemrestrictioncreateblockedoriented_
+FNL:35,353,372
+FNA:35,0,ceedelemrestrictioncreateblockedcurloriented_
+FNL:36,375,389
+FNA:36,0,ceedelemrestrictioncreateblockedstrided_
+FNL:37,397,419
+FNA:37,0,ceedelemrestrictionapply_
+FNL:38,422,444
+FNA:38,0,ceedelemrestrictionapplyblock_
+FNL:39,447,449
+FNA:39,0,ceedelemrestrictiongetmultiplicity_
+FNL:40,452,456
+FNA:40,0,ceedelemrestrictiongetelayout_
+FNL:41,459,461
+FNA:41,0,ceedelemrestrictionsetnumviewtabs_
+FNL:42,464,464
+FNA:42,0,ceedelemrestrictionview_
+FNL:43,467,479
+FNA:43,0,ceedrequestwait_
+FNL:44,482,495
+FNA:44,0,ceedelemrestrictiondestroy_
+FNL:45,506,518
+FNA:45,192,ceedbasiscreatetensorh1lagrange_
+FNL:46,521,536
+FNA:46,0,ceedbasiscreatetensorh1_
+FNL:47,539,553
+FNA:47,0,ceedbasiscreateh1_
+FNL:48,556,570
+FNA:48,0,ceedbasiscreatehdiv_
+FNL:49,573,587
+FNA:49,0,ceedbasiscreatehcurl_
+FNL:50,590,590
+FNA:50,0,ceedbasissetnumviewtabs_
+FNL:51,593,593
+FNA:51,0,ceedbasisview_
+FNL:52,596,598
+FNA:52,0,ceedbasisgetcollocatedgrad_
+FNL:53,601,604
+FNA:53,0,ceedbasisapply_
+FNL:54,607,607
+FNA:54,0,ceedbasisgetnumnodes_
+FNL:55,610,610
+FNA:55,0,ceedbasisgetnumquadraturepoints_
+FNL:56,613,618
+FNA:56,0,ceedbasisgetinterp1d_
+FNL:57,621,626
+FNA:57,0,ceedbasisgetgrad1d_
+FNL:58,629,634
+FNA:58,0,ceedbasisgetqref_
+FNL:59,637,650
+FNA:59,192,ceedbasisdestroy_
+FNL:60,64,78
+FNA:60,96,ceedinit_
+FNL:61,653,655
+FNA:61,0,ceedgaussquadrature_
+FNL:62,658,660
+FNA:62,96,ceedlobattoquadrature_
+FNL:63,671,683
+FNA:63,96,ceedqfunctioncontextcreate_
+FNL:64,686,689
+FNA:64,96,ceedqfunctioncontextsetdata_
+FNL:65,692,697
+FNA:65,0,ceedqfunctioncontextgetdata_
+FNL:66,700,703
+FNA:66,0,ceedqfunctioncontextrestoredata_
+FNL:67,706,708
+FNA:67,0,ceedqfunctioncontextsetnumviewtabs_
+FNL:68,711,711
+FNA:68,0,ceedqfunctioncontextview_
+FNL:69,714,727
+FNA:69,96,ceedqfunctioncontextdestroy_
+FNL:70,737,760
+FNA:70,768,CeedQFunctionFortranStub
+FNL:71,763,801
+FNA:71,96,ceedqfunctioncreateinterior_
+FNL:72,804,818
+FNA:72,96,ceedqfunctioncreateinteriorbyname_
+FNL:73,81,83
+FNA:73,0,ceedisdeterministic_
+FNL:74,821,834
+FNA:74,0,ceedqfunctioncreateidentity_
+FNL:75,837,843
+FNA:75,192,ceedqfunctionaddinput_
+FNL:76,846,852
+FNA:76,96,ceedqfunctionaddoutput_
+FNL:77,855,869
+FNA:77,48,ceedqfunctionsetcontext_
+FNL:78,86,86
+FNA:78,0,ceedgetpreferredmemtype_
+FNL:79,872,874
+FNA:79,0,ceedqfunctionsetnumviewtabs_
+FNL:80,877,881
+FNA:80,0,ceedqfunctionview_
+FNL:81,885,933
+FNA:81,0,ceedqfunctionapply_
+FNL:82,89,89
+FNA:82,0,ceedsetnumviewtabs_
+FNL:83,92,92
+FNA:83,0,ceedview_
+FNL:84,936,949
+FNA:84,192,ceedqfunctiondestroy_
+FNL:85,95,108
+FNA:85,96,ceeddestroy_
+FNL:86,960,976
+FNA:86,192,ceedoperatorcreate_
+FNL:87,979,991
+FNA:87,0,ceedoperatorcreatecomposite_
+FNL:88,994,1028
+FNA:88,576,ceedoperatorsetfield_
+FNF:89
+FNH:27
+DA:64,96
+DA:65,96
+DA:66,96
+DA:67,96
+DA:68,96
+DA:71,96
+DA:72,96
+DA:74,96
+DA:75,96
+DA:76,96
+DA:78,96
+DA:81,0
+DA:82,0
+DA:83,0
+DA:86,0
+DA:89,0
+DA:92,0
+DA:95,96
+DA:96,96
+DA:97,96
+DA:99,96
+DA:100,96
+DA:101,96
+DA:102,96
+DA:103,96
+DA:104,96
+DA:105,96
+DA:119,384
+DA:120,384
+DA:121,288
+DA:122,288
+DA:125,384
+DA:126,384
+DA:128,384
+DA:129,384
+DA:130,384
+DA:132,384
+DA:135,288
+DA:136,288
+DA:137,288
+DA:140,0
+DA:142,0
+DA:143,0
+DA:144,0
+DA:145,0
+DA:148,0
+DA:151,96
+DA:154,0
+DA:156,0
+DA:157,0
+DA:158,0
+DA:159,0
+DA:162,96
+DA:164,96
+DA:165,96
+DA:166,96
+DA:167,96
+DA:170,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:175,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:181,0
+DA:182,0
+DA:185,96
+DA:186,96
+DA:187,96
+DA:188,96
+DA:191,0
+DA:192,0
+DA:193,0
+DA:196,0
+DA:199,0
+DA:202,0
+DA:205,384
+DA:206,384
+DA:207,384
+DA:209,384
+DA:210,384
+DA:211,384
+DA:212,384
+DA:213,96
+DA:214,96
+DA:215,96
+DA:229,192
+DA:231,192
+DA:232,192
+DA:233,192
+DA:236,192
+DA:238,192
+DA:239,384
+DA:240,192
+DA:242,192
+DA:243,192
+DA:244,192
+DA:246,192
+DA:249,0
+DA:251,0
+DA:252,0
+DA:253,0
+DA:256,0
+DA:257,0
+DA:259,0
+DA:260,0
+DA:261,0
+DA:263,0
+DA:264,0
+DA:265,0
+DA:267,0
+DA:270,0
+DA:273,0
+DA:274,0
+DA:275,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:282,0
+DA:283,0
+DA:285,0
+DA:286,0
+DA:287,0
+DA:289,0
+DA:292,96
+DA:294,96
+DA:295,96
+DA:296,96
+DA:299,96
+DA:300,96
+DA:301,96
+DA:303,96
+DA:304,96
+DA:305,96
+DA:307,96
+DA:310,0
+DA:312,0
+DA:313,0
+DA:314,0
+DA:317,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:323,0
+DA:324,0
+DA:325,0
+DA:327,0
+DA:330,0
+DA:333,0
+DA:334,0
+DA:335,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:349,0
+DA:353,0
+DA:356,0
+DA:357,0
+DA:358,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:365,0
+DA:366,0
+DA:368,0
+DA:369,0
+DA:370,0
+DA:372,0
+DA:375,0
+DA:377,0
+DA:378,0
+DA:379,0
+DA:382,0
+DA:383,0
+DA:385,0
+DA:386,0
+DA:387,0
+DA:389,0
+DA:397,0
+DA:398,0
+DA:400,0
+DA:402,0
+DA:403,0
+DA:404,0
+DA:408,0
+DA:409,0
+DA:410,0
+DA:412,0
+DA:413,0
+DA:415,0
+DA:416,0
+DA:417,0
+DA:419,0
+DA:422,0
+DA:423,0
+DA:425,0
+DA:427,0
+DA:428,0
+DA:429,0
+DA:433,0
+DA:434,0
+DA:435,0
+DA:437,0
+DA:438,0
+DA:440,0
+DA:441,0
+DA:442,0
+DA:444,0
+DA:447,0
+DA:448,0
+DA:449,0
+DA:452,0
+DA:454,0
+DA:455,0
+DA:456,0
+DA:459,0
+DA:460,0
+DA:461,0
+DA:464,0
+DA:467,0
+DA:471,0
+DA:472,0
+DA:473,0
+DA:474,0
+DA:475,0
+DA:476,0
+DA:479,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:486,0
+DA:487,0
+DA:488,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:506,192
+DA:507,192
+DA:508,192
+DA:509,192
+DA:512,192
+DA:514,192
+DA:515,192
+DA:516,192
+DA:518,192
+DA:521,0
+DA:524,0
+DA:525,0
+DA:526,0
+DA:529,0
+DA:530,0
+DA:532,0
+DA:533,0
+DA:534,0
+DA:536,0
+DA:539,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:546,0
+DA:547,0
+DA:549,0
+DA:550,0
+DA:551,0
+DA:553,0
+DA:556,0
+DA:558,0
+DA:559,0
+DA:560,0
+DA:563,0
+DA:564,0
+DA:566,0
+DA:567,0
+DA:568,0
+DA:570,0
+DA:573,0
+DA:575,0
+DA:576,0
+DA:577,0
+DA:580,0
+DA:581,0
+DA:583,0
+DA:584,0
+DA:585,0
+DA:587,0
+DA:590,0
+DA:593,0
+DA:596,0
+DA:597,0
+DA:598,0
+DA:601,0
+DA:602,0
+DA:603,0
+DA:604,0
+DA:607,0
+DA:610,0
+DA:613,0
+DA:615,0
+DA:616,0
+DA:617,0
+DA:618,0
+DA:621,0
+DA:623,0
+DA:624,0
+DA:625,0
+DA:626,0
+DA:629,0
+DA:631,0
+DA:632,0
+DA:633,0
+DA:634,0
+DA:637,192
+DA:638,192
+DA:639,192
+DA:641,192
+DA:642,192
+DA:643,192
+DA:644,192
+DA:645,96
+DA:646,96
+DA:647,96
+DA:653,0
+DA:654,0
+DA:655,0
+DA:658,96
+DA:659,96
+DA:660,96
+DA:671,96
+DA:672,96
+DA:673,96
+DA:674,96
+DA:677,96
+DA:679,96
+DA:680,96
+DA:681,96
+DA:682,96
+DA:686,96
+DA:687,96
+DA:688,96
+DA:689,96
+DA:692,0
+DA:694,0
+DA:695,0
+DA:696,0
+DA:697,0
+DA:700,0
+DA:701,0
+DA:702,0
+DA:703,0
+DA:706,0
+DA:707,0
+DA:708,0
+DA:711,0
+DA:714,96
+DA:715,96
+DA:716,96
+DA:718,96
+DA:719,96
+DA:720,96
+DA:721,96
+DA:722,96
+DA:723,96
+DA:724,96
+DA:737,768
+DA:738,768
+DA:739,768
+DA:742,768
+DA:746,768
+DA:747,384
+DA:748,384
+DA:751,768
+DA:752,768
+DA:754,768
+DA:755,384
+DA:756,384
+DA:759,768
+DA:763,96
+DA:771,96
+DA:772,96
+DA:773,96
+DA:774,96
+DA:777,96
+DA:778,96
+DA:780,96
+DA:781,96
+DA:782,96
+DA:786,96
+DA:787,96
+DA:788,96
+DA:789,96
+DA:791,96
+DA:792,96
+DA:793,96
+DA:794,96
+DA:795,96
+DA:796,96
+DA:797,96
+DA:798,96
+DA:800,96
+DA:804,96
+DA:805,96
+DA:806,96
+DA:807,96
+DA:808,96
+DA:811,96
+DA:812,96
+DA:814,96
+DA:815,96
+DA:816,96
+DA:818,96
+DA:821,0
+DA:822,0
+DA:823,0
+DA:824,0
+DA:827,0
+DA:828,0
+DA:830,0
+DA:831,0
+DA:832,0
+DA:834,0
+DA:837,192
+DA:839,192
+DA:840,192
+DA:842,192
+DA:843,192
+DA:846,96
+DA:848,96
+DA:849,96
+DA:851,96
+DA:852,96
+DA:855,48
+DA:856,48
+DA:857,48
+DA:860,48
+DA:861,48
+DA:863,48
+DA:864,48
+DA:865,48
+DA:866,48
+DA:867,48
+DA:868,48
+DA:872,0
+DA:873,0
+DA:874,0
+DA:877,0
+DA:878,0
+DA:880,0
+DA:881,0
+DA:885,0
+DA:888,0
+DA:890,0
+DA:891,0
+DA:892,0
+DA:893,0
+DA:894,0
+DA:895,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:901,0
+DA:902,0
+DA:903,0
+DA:904,0
+DA:905,0
+DA:906,0
+DA:907,0
+DA:909,0
+DA:910,0
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:917,0
+DA:918,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:923,0
+DA:924,0
+DA:925,0
+DA:926,0
+DA:927,0
+DA:928,0
+DA:930,0
+DA:931,0
+DA:932,0
+DA:936,192
+DA:937,192
+DA:939,192
+DA:940,192
+DA:941,192
+DA:942,192
+DA:943,192
+DA:944,96
+DA:945,96
+DA:946,96
+DA:960,192
+DA:961,192
+DA:962,192
+DA:963,192
+DA:966,192
+DA:968,192
+DA:969,192
+DA:970,192
+DA:972,192
+DA:973,192
+DA:974,192
+DA:975,192
+DA:979,0
+DA:980,0
+DA:981,0
+DA:982,0
+DA:985,0
+DA:987,0
+DA:988,0
+DA:989,0
+DA:990,0
+DA:994,576
+DA:995,576
+DA:1000,576
+DA:1002,576
+DA:1003,0
+DA:1004,576
+DA:1005,96
+DA:1007,480
+DA:1010,576
+DA:1011,0
+DA:1012,576
+DA:1013,192
+DA:1015,384
+DA:1017,576
+DA:1018,0
+DA:1019,576
+DA:1020,384
+DA:1021,192
+DA:1022,96
+DA:1024,96
+DA:1027,576
+DA:1028,576
+DA:1031,0
+DA:1032,0
+DA:1033,0
+DA:1035,0
+DA:1036,0
+DA:1039,0
+DA:1040,0
+DA:1041,0
+DA:1043,0
+DA:1044,0
+DA:1047,0
+DA:1048,0
+DA:1050,0
+DA:1051,0
+DA:1054,0
+DA:1056,0
+DA:1057,0
+DA:1058,0
+DA:1060,0
+DA:1063,0
+DA:1064,0
+DA:1065,0
+DA:1067,0
+DA:1069,0
+DA:1071,0
+DA:1072,0
+DA:1075,0
+DA:1076,0
+DA:1077,0
+DA:1081,0
+DA:1082,0
+DA:1083,0
+DA:1085,0
+DA:1086,0
+DA:1087,0
+DA:1088,0
+DA:1089,0
+DA:1092,0
+DA:1093,0
+DA:1094,0
+DA:1095,0
+DA:1096,0
+DA:1101,0
+DA:1102,0
+DA:1104,0
+DA:1105,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1114,0
+DA:1115,0
+DA:1116,0
+DA:1118,0
+DA:1119,0
+DA:1120,0
+DA:1121,0
+DA:1122,0
+DA:1127,0
+DA:1133,0
+DA:1134,0
+DA:1136,0
+DA:1137,0
+DA:1138,0
+DA:1140,0
+DA:1141,0
+DA:1142,0
+DA:1143,0
+DA:1144,0
+DA:1145,0
+DA:1146,0
+DA:1147,0
+DA:1151,0
+DA:1157,0
+DA:1158,0
+DA:1160,0
+DA:1161,0
+DA:1162,0
+DA:1164,0
+DA:1165,0
+DA:1166,0
+DA:1167,0
+DA:1168,0
+DA:1169,0
+DA:1170,0
+DA:1171,0
+DA:1175,0
+DA:1181,0
+DA:1182,0
+DA:1184,0
+DA:1185,0
+DA:1186,0
+DA:1188,0
+DA:1189,0
+DA:1190,0
+DA:1191,0
+DA:1192,0
+DA:1193,0
+DA:1194,0
+DA:1195,0
+DA:1199,0
+DA:1200,0
+DA:1202,0
+DA:1203,0
+DA:1206,0
+DA:1208,0
+DA:1209,0
+DA:1210,0
+DA:1212,0
+DA:1214,0
+DA:1216,0
+DA:1217,0
+DA:1220,0
+DA:1221,0
+DA:1222,0
+DA:1226,0
+DA:1227,0
+DA:1228,0
+DA:1230,0
+DA:1231,0
+DA:1232,0
+DA:1233,0
+DA:1234,0
+DA:1237,0
+DA:1238,0
+DA:1239,0
+DA:1244,192
+DA:1245,192
+DA:1246,192
+DA:1248,192
+DA:1250,192
+DA:1251,192
+DA:1254,192
+DA:1255,0
+DA:1256,0
+DA:1260,192
+DA:1261,0
+DA:1262,0
+DA:1264,192
+DA:1265,192
+DA:1266,192
+DA:1267,0
+DA:1268,0
+DA:1273,0
+DA:1274,0
+DA:1275,0
+DA:1277,0
+DA:1279,0
+DA:1280,0
+DA:1283,0
+DA:1284,0
+DA:1285,0
+DA:1289,0
+DA:1290,0
+DA:1291,0
+DA:1293,0
+DA:1294,0
+DA:1295,0
+DA:1296,0
+DA:1297,0
+DA:1302,0
+DA:1307,0
+DA:1310,192
+DA:1311,192
+DA:1312,192
+DA:1313,192
+DA:1314,192
+DA:1315,192
+DA:1316,192
+DA:1317,96
+DA:1318,96
+DA:1319,96
+LF:722
+LH:252
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-jit-tools.c
+FNL:0,124,296
+FNA:0,0,CeedLoadSourceToInitializedBuffer
+FNL:1,27,61
+FNA:1,144,CeedCheckFilePath
+FNL:2,313,324
+FNA:2,0,CeedLoadSourceAndInitializeBuffer
+FNL:3,339,350
+FNA:3,0,CeedLoadSourceToBuffer
+FNL:4,368,377
+FNA:4,48,CeedPathConcatenate
+FNL:5,389,393
+FNA:5,1536,CeedGetJitRelativePath
+FNL:6,406,439
+FNA:6,48,CeedGetJitAbsolutePath
+FNL:7,74,107
+FNA:7,0,CeedNormalizePath
+FNF:8
+FNH:4
+DA:27,144
+DA:31,144
+DA:33,144
+DA:34,144
+DA:36,144
+DA:37,144
+DA:39,0
+DA:43,144
+DA:44,144
+DA:48,144
+DA:49,144
+DA:51,144
+DA:53,96
+DA:54,96
+DA:55,96
+DA:59,144
+DA:60,144
+DA:74,0
+DA:75,0
+DA:77,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:84,0
+DA:85,0
+DA:87,0
+DA:88,0
+DA:89,0
+DA:92,0
+DA:93,0
+DA:95,0
+DA:96,0
+DA:98,0
+DA:99,0
+DA:100,0
+DA:101,0
+DA:104,0
+DA:106,0
+DA:124,0
+DA:126,0
+DA:130,0
+DA:131,0
+DA:132,0
+DA:135,0
+DA:136,0
+DA:138,0
+DA:139,0
+DA:140,0
+DA:143,0
+DA:146,0
+DA:147,0
+DA:150,0
+DA:157,0
+DA:160,0
+DA:162,0
+DA:164,0
+DA:165,0
+DA:167,0
+DA:168,0
+DA:171,0
+DA:172,0
+DA:173,0
+DA:176,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:183,0
+DA:184,0
+DA:186,0
+DA:187,0
+DA:188,0
+DA:189,0
+DA:191,0
+DA:195,0
+DA:197,0
+DA:198,0
+DA:201,0
+DA:202,0
+DA:203,0
+DA:206,0
+DA:208,0
+DA:209,0
+DA:211,0
+DA:212,0
+DA:213,0
+DA:214,0
+DA:216,0
+DA:217,0
+DA:218,0
+DA:219,0
+DA:220,0
+DA:221,0
+DA:222,0
+DA:223,0
+DA:224,0
+DA:225,0
+DA:227,0
+DA:229,0
+DA:232,0
+DA:233,0
+DA:234,0
+DA:236,0
+DA:237,0
+DA:238,0
+DA:239,0
+DA:241,0
+DA:243,0
+DA:245,0
+DA:246,0
+DA:247,0
+DA:248,0
+DA:253,0
+DA:254,0
+DA:255,0
+DA:256,0
+DA:257,0
+DA:258,0
+DA:259,0
+DA:260,0
+DA:262,0
+DA:263,0
+DA:264,0
+DA:265,0
+DA:267,0
+DA:268,0
+DA:269,0
+DA:270,0
+DA:272,0
+DA:275,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:282,0
+DA:283,0
+DA:284,0
+DA:287,0
+DA:290,0
+DA:291,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:313,0
+DA:315,0
+DA:316,0
+DA:319,0
+DA:322,0
+DA:323,0
+DA:339,0
+DA:340,0
+DA:341,0
+DA:344,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:368,48
+DA:369,48
+DA:370,48
+DA:371,48
+DA:373,48
+DA:374,48
+DA:375,48
+DA:376,48
+DA:389,1536
+DA:390,1536
+DA:391,1536
+DA:392,1536
+DA:406,48
+DA:411,48
+DA:412,48
+DA:413,48
+DA:415,48
+DA:416,48
+DA:420,48
+DA:421,48
+DA:424,96
+DA:425,48
+DA:427,48
+DA:428,48
+DA:429,48
+LF:180
+LH:41
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-object.c
+FNL:0,108,111
+FNA:0,0,CeedObjectView
+FNL:1,123,127
+FNA:1,0,CeedObjectSetNumViewTabs
+FNL:2,139,142
+FNA:2,0,CeedObjectGetNumViewTabs
+FNL:3,154,158
+FNA:3,9888,CeedObjectGetCeed
+FNL:4,169,169
+FNA:4,11232,CeedObjectReturnCeed
+FNL:5,180,183
+FNA:5,0,CeedObjectDestroy
+FNL:6,37,45
+FNA:6,7200,CeedObjectCreate
+FNL:7,56,59
+FNA:7,49140,CeedObjectReference
+FNL:8,70,72
+FNA:8,55728,CeedObjectDereference
+FNL:9,83,88
+FNA:9,6780,CeedObjectDestroy_Private
+FNF:10
+FNH:6
+DA:37,7200
+DA:38,7200
+DA:39,7200
+DA:40,7200
+DA:41,7200
+DA:42,7200
+DA:43,7200
+DA:44,7200
+DA:56,49140
+DA:57,49140
+DA:58,49140
+DA:70,55728
+DA:71,55728
+DA:83,6780
+DA:84,6780
+DA:86,6780
+DA:87,6780
+DA:108,0
+DA:109,0
+DA:110,0
+DA:123,0
+DA:124,0
+DA:125,0
+DA:126,0
+DA:139,0
+DA:140,0
+DA:141,0
+DA:154,9888
+DA:155,9888
+DA:156,9888
+DA:157,9888
+DA:169,11232
+DA:180,0
+DA:181,0
+DA:182,0
+LF:35
+LH:22
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-operator.c
+FNL:0,1049,1064
+FNA:0,1920,CeedOperatorGetFields
+FNL:1,1079,1100
+FNA:1,0,CeedOperatorAtPointsSetPoints
+FNL:2,1112,1115
+FNA:2,1536,CeedOperatorIsAtPoints
+FNL:3,1130,1146
+FNA:3,0,CeedOperatorAtPointsGetPoints
+FNL:4,1163,1185
+FNA:4,0,CeedOperatorGetFieldByName
+FNL:5,1197,1200
+FNA:5,0,CeedOperatorFieldGetName
+FNL:6,1214,1218
+FNA:6,10416,CeedOperatorFieldGetElemRestriction
+FNL:7,1232,1236
+FNA:7,5376,CeedOperatorFieldGetBasis
+FNL:8,1250,1254
+FNA:8,7776,CeedOperatorFieldGetVector
+FNL:9,1273,1279
+FNA:9,0,CeedOperatorFieldGetData
+FNL:10,1291,1319
+FNA:10,0,CeedOperatorCompositeAddSub
+FNL:11,1331,1338
+FNA:11,0,CeedOperatorCompositeGetNumSub
+FNL:12,1350,1357
+FNA:12,0,CeedOperatorCompositeGetSubList
+FNL:13,1374,1391
+FNA:13,0,CeedOperatorCompositeGetSubByName
+FNL:14,138,175
+FNA:14,0,CeedOperatorSingleView
+FNL:15,1402,1450
+FNA:15,3072,CeedOperatorCheckReady
+FNL:16,1465,1494
+FNA:16,0,CeedOperatorGetActiveVectorLengths
+FNL:17,1509,1524
+FNA:17,0,CeedOperatorSetQFunctionAssemblyReuse
+FNL:18,1536,1556
+FNA:18,0,CeedOperatorSetQFunctionAssemblyDataUpdateNeeded
+FNL:19,1568,1579
+FNA:19,0,CeedOperatorSetName
+FNL:20,1591,1602
+FNA:20,0,CeedOperatorGetName
+FNL:21,1615,1652
+FNA:21,0,CeedOperatorView_Core
+FNL:22,1664,1667
+FNA:22,0,CeedOperatorSetNumViewTabs
+FNL:23,1679,1682
+FNA:23,0,CeedOperatorGetNumViewTabs
+FNL:24,1694,1697
+FNA:24,0,CeedOperatorView
+FNL:25,1709,1712
+FNA:25,0,CeedOperatorViewTerse
+FNL:26,1724,1727
+FNA:26,1536,CeedOperatorGetCeed
+FNL:27,1738,1738
+FNA:27,1152,CeedOperatorReturnCeed
+FNL:28,1750,1757
+FNA:28,384,CeedOperatorGetNumElements
+FNL:29,1769,1776
+FNA:29,768,CeedOperatorGetNumQuadraturePoints
+FNL:30,1786,1907
+FNA:30,0,CeedOperatorGetFlopsEstimate
+FNL:31,187,190
+FNA:31,0,CeedOperatorView_Object
+FNL:32,1924,1937
+FNA:32,0,CeedOperatorGetContext
+FNL:33,1952,2051
+FNA:33,0,CeedOperatorGetContextFieldLabel
+FNL:34,201,204
+FNA:34,0,CeedOperatorDestroy_Object
+FNL:35,2066,2068
+FNA:35,0,CeedOperatorSetContextDouble
+FNL:36,2084,2086
+FNA:36,0,CeedOperatorGetContextDoubleRead
+FNL:37,2099,2101
+FNA:37,0,CeedOperatorRestoreContextDoubleRead
+FNL:38,2116,2118
+FNA:38,0,CeedOperatorSetContextInt32
+FNL:39,2134,2136
+FNA:39,0,CeedOperatorGetContextInt32Read
+FNL:40,2149,2151
+FNA:40,0,CeedOperatorRestoreContextInt32Read
+FNL:41,2166,2168
+FNA:41,0,CeedOperatorSetContextBoolean
+FNL:42,218,221
+FNA:42,0,CeedOperatorGetActiveBasis
+FNL:43,2184,2186
+FNA:43,0,CeedOperatorGetContextBooleanRead
+FNL:44,2199,2201
+FNA:44,0,CeedOperatorRestoreContextBooleanRead
+FNL:45,2220,2241
+FNA:45,384,CeedOperatorApply
+FNL:46,2262,2287
+FNA:46,384,CeedOperatorApplyAdd
+FNL:47,2306,2354
+FNA:47,384,CeedOperatorApplyAddActive
+FNL:48,236,287
+FNA:48,0,CeedOperatorGetActiveBases
+FNL:49,2365,2383
+FNA:49,384,CeedOperatorAssemblyDataStrip
+FNL:50,2394,2466
+FNA:50,768,CeedOperatorDestroy
+FNL:51,301,304
+FNA:51,0,CeedOperatorGetActiveElemRestriction
+FNL:52,319,370
+FNA:52,0,CeedOperatorGetActiveElemRestrictions
+FNL:53,36,82
+FNA:53,1152,CeedOperatorCheckField
+FNL:54,387,432
+FNA:54,0,CeedOperatorContextSetGeneric
+FNL:55,450,500
+FNA:55,0,CeedOperatorContextGetGenericRead
+FNL:56,517,563
+FNA:56,0,CeedOperatorContextRestoreGenericRead
+FNL:57,583,590
+FNA:57,0,CeedOperatorGetNumArgs
+FNL:58,604,633
+FNA:58,0,CeedOperatorHasTensorBases
+FNL:59,645,648
+FNA:59,1152,CeedOperatorIsImmutable
+FNL:60,660,663
+FNA:60,384,CeedOperatorIsSetupDone
+FNL:61,675,683
+FNA:61,4224,CeedOperatorGetQFunction
+FNL:62,695,698
+FNA:62,10368,CeedOperatorIsComposite
+FNL:63,710,713
+FNA:63,1152,CeedOperatorGetData
+FNL:64,725,728
+FNA:64,384,CeedOperatorSetData
+FNL:65,739,742
+FNA:65,0,CeedOperatorReference
+FNL:66,753,756
+FNA:66,384,CeedOperatorSetSetupDone
+FNL:67,781,805
+FNA:67,576,CeedOperatorCreate
+FNL:68,823,848
+FNA:68,0,CeedOperatorCreateAtPoints
+FNL:69,860,881
+FNA:69,0,CeedOperatorCreateComposite
+FNL:70,898,903
+FNA:70,0,CeedOperatorReferenceCopy
+FNL:71,928,1032
+FNA:71,1152,CeedOperatorSetField
+FNL:72,98,125
+FNA:72,0,CeedOperatorFieldView
+FNF:73
+FNH:25
+DA:36,1152
+DA:38,1152
+DA:42,1152
+DA:45,1152
+DA:47,1152
+DA:48,960
+DA:51,1152
+DA:53,1152
+DA:54,768
+DA:55,768
+DA:56,768
+DA:57,768
+DA:63,1152
+DA:64,384
+DA:65,384
+DA:68,384
+DA:69,576
+DA:73,576
+DA:76,576
+DA:77,192
+DA:79,192
+DA:81,1152
+DA:98,0
+DA:100,0
+DA:108,0
+DA:109,0
+DA:111,0
+DA:116,0
+DA:117,0
+DA:118,0
+DA:119,0
+DA:120,0
+DA:122,0
+DA:123,0
+DA:124,0
+DA:138,0
+DA:140,0
+DA:145,0
+DA:146,0
+DA:147,0
+DA:148,0
+DA:149,0
+DA:150,0
+DA:151,0
+DA:152,0
+DA:154,0
+DA:155,0
+DA:158,0
+DA:159,0
+DA:160,0
+DA:161,0
+DA:163,0
+DA:165,0
+DA:166,0
+DA:167,0
+DA:168,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:174,0
+DA:187,0
+DA:188,0
+DA:189,0
+DA:201,0
+DA:202,0
+DA:203,0
+DA:218,0
+DA:219,0
+DA:220,0
+DA:236,0
+DA:241,0
+DA:242,0
+DA:244,0
+DA:245,0
+DA:246,0
+DA:247,0
+DA:250,0
+DA:251,0
+DA:254,0
+DA:255,0
+DA:257,0
+DA:258,0
+DA:260,0
+DA:262,0
+DA:265,0
+DA:266,0
+DA:267,0
+DA:268,0
+DA:271,0
+DA:272,0
+DA:275,0
+DA:276,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:283,0
+DA:286,0
+DA:301,0
+DA:302,0
+DA:303,0
+DA:319,0
+DA:324,0
+DA:325,0
+DA:327,0
+DA:328,0
+DA:329,0
+DA:330,0
+DA:333,0
+DA:334,0
+DA:337,0
+DA:338,0
+DA:340,0
+DA:341,0
+DA:343,0
+DA:345,0
+DA:348,0
+DA:349,0
+DA:350,0
+DA:351,0
+DA:354,0
+DA:355,0
+DA:358,0
+DA:359,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:366,0
+DA:369,0
+DA:387,0
+DA:388,0
+DA:390,0
+DA:393,0
+DA:394,0
+DA:396,0
+DA:397,0
+DA:399,0
+DA:402,0
+DA:403,0
+DA:407,0
+DA:408,0
+DA:409,0
+DA:412,0
+DA:415,0
+DA:417,0
+DA:418,0
+DA:420,0
+DA:425,0
+DA:426,0
+DA:427,0
+DA:428,0
+DA:430,0
+DA:431,0
+DA:450,0
+DA:452,0
+DA:454,0
+DA:456,0
+DA:457,0
+DA:460,0
+DA:461,0
+DA:463,0
+DA:464,0
+DA:466,0
+DA:469,0
+DA:470,0
+DA:474,0
+DA:475,0
+DA:476,0
+DA:479,0
+DA:482,0
+DA:484,0
+DA:485,0
+DA:486,0
+DA:487,0
+DA:489,0
+DA:494,0
+DA:495,0
+DA:496,0
+DA:497,0
+DA:499,0
+DA:517,0
+DA:518,0
+DA:520,0
+DA:523,0
+DA:524,0
+DA:526,0
+DA:527,0
+DA:529,0
+DA:532,0
+DA:533,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:542,0
+DA:545,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:550,0
+DA:552,0
+DA:557,0
+DA:558,0
+DA:559,0
+DA:560,0
+DA:562,0
+DA:583,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:589,0
+DA:604,0
+DA:608,0
+DA:609,0
+DA:610,0
+DA:614,0
+DA:615,0
+DA:616,0
+DA:617,0
+DA:619,0
+DA:621,0
+DA:625,0
+DA:626,0
+DA:627,0
+DA:628,0
+DA:630,0
+DA:632,0
+DA:645,1152
+DA:646,1152
+DA:647,1152
+DA:660,384
+DA:661,384
+DA:662,384
+DA:675,4224
+DA:678,4224
+DA:679,4224
+DA:680,4224
+DA:681,4224
+DA:682,4224
+DA:695,10368
+DA:696,10368
+DA:697,10368
+DA:710,1152
+DA:711,1152
+DA:712,1152
+DA:725,384
+DA:726,384
+DA:727,384
+DA:739,0
+DA:740,0
+DA:741,0
+DA:753,384
+DA:754,384
+DA:755,384
+DA:781,576
+DA:782,576
+DA:785,192
+DA:786,192
+DA:787,192
+DA:788,192
+DA:789,192
+DA:792,384
+DA:794,384
+DA:795,384
+DA:796,384
+DA:797,384
+DA:798,384
+DA:799,384
+DA:800,384
+DA:801,384
+DA:802,384
+DA:803,384
+DA:804,384
+DA:823,0
+DA:824,0
+DA:827,0
+DA:828,0
+DA:829,0
+DA:830,0
+DA:831,0
+DA:834,0
+DA:836,0
+DA:837,0
+DA:838,0
+DA:839,0
+DA:840,0
+DA:841,0
+DA:842,0
+DA:843,0
+DA:844,0
+DA:845,0
+DA:846,0
+DA:847,0
+DA:860,0
+DA:861,0
+DA:864,0
+DA:865,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:872,0
+DA:873,0
+DA:874,0
+DA:875,0
+DA:876,0
+DA:877,0
+DA:879,0
+DA:880,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:901,0
+DA:902,0
+DA:928,1152
+DA:929,1152
+DA:930,1152
+DA:935,1152
+DA:936,1152
+DA:937,1152
+DA:938,1152
+DA:939,1152
+DA:940,1152
+DA:941,1152
+DA:942,1152
+DA:944,1152
+DA:945,1152
+DA:950,1152
+DA:951,1152
+DA:952,0
+DA:954,0
+DA:956,0
+DA:957,0
+DA:961,0
+DA:962,0
+DA:968,1152
+DA:969,768
+DA:970,1152
+DA:975,1152
+DA:976,1152
+DA:977,1152
+DA:978,2304
+DA:981,1920
+DA:982,1920
+DA:983,768
+DA:984,768
+DA:985,768
+DA:988,384
+DA:989,384
+DA:992,384
+DA:993,384
+DA:994,384
+DA:995,384
+DA:996,384
+DA:1002,1152
+DA:1003,1152
+DA:1004,1152
+DA:1006,1152
+DA:1009,768
+DA:1010,768
+DA:1011,384
+DA:1012,384
+DA:1015,384
+DA:1016,384
+DA:1021,1152
+DA:1022,1152
+DA:1023,1152
+DA:1024,384
+DA:1025,384
+DA:1027,1152
+DA:1028,1152
+DA:1029,1152
+DA:1030,1152
+DA:1031,1152
+DA:1049,1920
+DA:1054,1920
+DA:1055,1920
+DA:1056,1920
+DA:1058,1920
+DA:1059,1920
+DA:1060,1920
+DA:1061,1920
+DA:1062,1920
+DA:1063,1920
+DA:1079,0
+DA:1082,0
+DA:1083,0
+DA:1084,0
+DA:1085,0
+DA:1087,0
+DA:1088,0
+DA:1092,0
+DA:1093,0
+DA:1097,0
+DA:1098,0
+DA:1099,0
+DA:1112,1536
+DA:1113,1536
+DA:1114,1536
+DA:1130,0
+DA:1133,0
+DA:1134,0
+DA:1135,0
+DA:1137,0
+DA:1138,0
+DA:1139,0
+DA:1141,0
+DA:1142,0
+DA:1143,0
+DA:1145,0
+DA:1163,0
+DA:1168,0
+DA:1169,0
+DA:1170,0
+DA:1171,0
+DA:1172,0
+DA:1173,0
+DA:1174,0
+DA:1177,0
+DA:1178,0
+DA:1179,0
+DA:1180,0
+DA:1181,0
+DA:1184,0
+DA:1197,0
+DA:1198,0
+DA:1199,0
+DA:1214,10416
+DA:1215,10416
+DA:1216,10416
+DA:1217,10416
+DA:1232,5376
+DA:1233,5376
+DA:1234,5376
+DA:1235,5376
+DA:1250,7776
+DA:1251,7776
+DA:1252,7776
+DA:1253,7776
+DA:1273,0
+DA:1274,0
+DA:1275,0
+DA:1276,0
+DA:1277,0
+DA:1278,0
+DA:1291,0
+DA:1294,0
+DA:1295,0
+DA:1297,0
+DA:1298,0
+DA:1303,0
+DA:1304,0
+DA:1305,0
+DA:1307,0
+DA:1315,0
+DA:1316,0
+DA:1317,0
+DA:1318,0
+DA:1331,0
+DA:1334,0
+DA:1335,0
+DA:1336,0
+DA:1337,0
+DA:1350,0
+DA:1353,0
+DA:1354,0
+DA:1355,0
+DA:1356,0
+DA:1374,0
+DA:1379,0
+DA:1380,0
+DA:1381,0
+DA:1382,0
+DA:1383,0
+DA:1384,0
+DA:1385,0
+DA:1386,0
+DA:1387,0
+DA:1390,0
+DA:1402,3072
+DA:1404,3072
+DA:1406,3072
+DA:1408,384
+DA:1409,384
+DA:1410,384
+DA:1411,384
+DA:1414,0
+DA:1415,0
+DA:1417,0
+DA:1418,0
+DA:1422,0
+DA:1423,0
+DA:1424,0
+DA:1429,0
+DA:1434,384
+DA:1435,384
+DA:1436,384
+DA:1438,384
+DA:1439,384
+DA:1444,384
+DA:1445,384
+DA:1446,384
+DA:1447,384
+DA:1448,384
+DA:1449,384
+DA:1465,0
+DA:1468,0
+DA:1469,0
+DA:1471,0
+DA:1472,0
+DA:1476,0
+DA:1477,0
+DA:1478,0
+DA:1481,0
+DA:1482,0
+DA:1483,0
+DA:1485,0
+DA:1493,0
+DA:1509,0
+DA:1512,0
+DA:1513,0
+DA:1514,0
+DA:1515,0
+DA:1520,0
+DA:1521,0
+DA:1523,0
+DA:1536,0
+DA:1539,0
+DA:1540,0
+DA:1544,0
+DA:1545,0
+DA:1546,0
+DA:1547,0
+DA:1552,0
+DA:1553,0
+DA:1555,0
+DA:1568,0
+DA:1570,0
+DA:1572,0
+DA:1573,0
+DA:1574,0
+DA:1575,0
+DA:1576,0
+DA:1578,0
+DA:1591,0
+DA:1592,0
+DA:1593,0
+DA:1594,0
+DA:1597,0
+DA:1598,0
+DA:1599,0
+DA:1601,0
+DA:1615,0
+DA:1617,0
+DA:1618,0
+DA:1619,0
+DA:1621,0
+DA:1622,0
+DA:1623,0
+DA:1624,0
+DA:1626,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1633,0
+DA:1634,0
+DA:1635,0
+DA:1636,0
+DA:1637,0
+DA:1638,0
+DA:1639,0
+DA:1640,0
+DA:1641,0
+DA:1642,0
+DA:1643,0
+DA:1646,0
+DA:1647,0
+DA:1648,0
+DA:1650,0
+DA:1651,0
+DA:1664,0
+DA:1665,0
+DA:1666,0
+DA:1679,0
+DA:1680,0
+DA:1681,0
+DA:1694,0
+DA:1695,0
+DA:1696,0
+DA:1709,0
+DA:1710,0
+DA:1711,0
+DA:1724,1536
+DA:1725,1536
+DA:1726,1536
+DA:1738,1152
+DA:1750,384
+DA:1753,384
+DA:1754,384
+DA:1755,384
+DA:1756,384
+DA:1769,768
+DA:1772,768
+DA:1773,768
+DA:1774,768
+DA:1775,768
+DA:1786,0
+DA:1789,0
+DA:1791,0
+DA:1792,0
+DA:1793,0
+DA:1796,0
+DA:1798,0
+DA:1801,0
+DA:1804,0
+DA:1805,0
+DA:1809,0
+DA:1814,0
+DA:1815,0
+DA:1816,0
+DA:1817,0
+DA:1819,0
+DA:1821,0
+DA:1822,0
+DA:1823,0
+DA:1825,0
+DA:1827,0
+DA:1828,0
+DA:1829,0
+DA:1831,0
+DA:1832,0
+DA:1834,0
+DA:1836,0
+DA:1838,0
+DA:1839,0
+DA:1840,0
+DA:1841,0
+DA:1844,0
+DA:1847,0
+DA:1848,0
+DA:1854,0
+DA:1855,0
+DA:1856,0
+DA:1857,0
+DA:1858,0
+DA:1859,0
+DA:1860,0
+DA:1861,0
+DA:1862,0
+DA:1864,0
+DA:1872,0
+DA:1873,0
+DA:1874,0
+DA:1875,0
+DA:1876,0
+DA:1877,0
+DA:1879,0
+DA:1883,0
+DA:1886,0
+DA:1887,0
+DA:1893,0
+DA:1894,0
+DA:1895,0
+DA:1896,0
+DA:1897,0
+DA:1898,0
+DA:1899,0
+DA:1900,0
+DA:1901,0
+DA:1903,0
+DA:1906,0
+DA:1924,0
+DA:1929,0
+DA:1930,0
+DA:1931,0
+DA:1932,0
+DA:1933,0
+DA:1934,0
+DA:1935,0
+DA:1936,0
+DA:1952,0
+DA:1953,0
+DA:1955,0
+DA:1957,0
+DA:1960,0
+DA:1961,0
+DA:1962,0
+DA:1963,0
+DA:1972,0
+DA:1973,0
+DA:1974,0
+DA:1975,0
+DA:1976,0
+DA:1978,0
+DA:1979,0
+DA:1982,0
+DA:1983,0
+DA:1984,0
+DA:1985,0
+DA:1986,0
+DA:1987,0
+DA:1988,0
+DA:1995,0
+DA:1997,0
+DA:2005,0
+DA:2011,0
+DA:2012,0
+DA:2025,0
+DA:2026,0
+DA:2027,0
+DA:2028,0
+DA:2029,0
+DA:2031,0
+DA:2036,0
+DA:2037,0
+DA:2040,0
+DA:2041,0
+DA:2042,0
+DA:2043,0
+DA:2044,0
+DA:2045,0
+DA:2047,0
+DA:2048,0
+DA:2050,0
+DA:2066,0
+DA:2067,0
+DA:2084,0
+DA:2085,0
+DA:2099,0
+DA:2100,0
+DA:2116,0
+DA:2117,0
+DA:2134,0
+DA:2135,0
+DA:2149,0
+DA:2150,0
+DA:2166,0
+DA:2167,0
+DA:2184,0
+DA:2185,0
+DA:2199,0
+DA:2200,0
+DA:2220,384
+DA:2223,384
+DA:2225,384
+DA:2226,384
+DA:2228,0
+DA:2229,384
+DA:2231,0
+DA:2235,384
+DA:2238,384
+DA:2240,384
+DA:2262,384
+DA:2265,384
+DA:2267,384
+DA:2268,384
+DA:2270,0
+DA:2271,0
+DA:2276,0
+DA:2277,0
+DA:2278,0
+DA:2279,0
+DA:2282,384
+DA:2284,384
+DA:2286,384
+DA:2306,384
+DA:2309,384
+DA:2311,384
+DA:2312,384
+DA:2317,0
+DA:2318,0
+DA:2321,0
+DA:2325,0
+DA:2326,0
+DA:2329,0
+DA:2330,0
+DA:2331,0
+DA:2335,0
+DA:2341,384
+DA:2343,768
+DA:2346,384
+DA:2347,384
+DA:2348,384
+DA:2351,384
+DA:2353,384
+DA:2365,384
+DA:2368,384
+DA:2369,384
+DA:2370,384
+DA:2371,384
+DA:2375,0
+DA:2376,0
+DA:2377,0
+DA:2378,0
+DA:2379,0
+DA:2382,384
+DA:2394,768
+DA:2395,768
+DA:2396,384
+DA:2397,384
+DA:2400,384
+DA:2401,384
+DA:2404,1536
+DA:2405,1152
+DA:2406,768
+DA:2407,576
+DA:2409,768
+DA:2410,576
+DA:2412,768
+DA:2413,192
+DA:2415,768
+DA:2416,768
+DA:2419,1536
+DA:2420,1152
+DA:2421,384
+DA:2422,384
+DA:2423,192
+DA:2425,384
+DA:2426,0
+DA:2428,384
+DA:2429,384
+DA:2432,384
+DA:2433,384
+DA:2435,384
+DA:2436,384
+DA:2437,384
+DA:2439,384
+DA:2441,384
+DA:2442,0
+DA:2443,0
+DA:2446,384
+DA:2447,384
+DA:2448,384
+DA:2449,384
+DA:2451,384
+DA:2452,0
+DA:2453,0
+DA:2454,0
+DA:2457,384
+DA:2460,384
+DA:2462,384
+DA:2463,384
+DA:2464,384
+DA:2465,384
+LF:841
+LH:250
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-preconditioning.c
+FNL:0,1011,1215
+FNA:0,0,CeedOperatorMultigridLevelCreateSingle_Core
+FNL:1,105,202
+FNA:1,0,CeedOperatorCreateFallback
+FNL:2,1234,1255
+FNA:2,0,CeedBuildMassLaplace
+FNL:3,1276,1298
+FNA:3,0,CeedOperatorGetBasisPointer
+FNL:4,1310,1340
+FNA:4,0,CeedOperatorCreateActivePointBlockRestriction
+FNL:5,1352,1361
+FNA:5,0,CeedOperatorGetQFunctionAssemblyData
+FNL:6,1373,1378
+FNA:6,0,CeedQFunctionAssemblyDataCreate
+FNL:7,1389,1392
+FNA:7,0,CeedQFunctionAssemblyDataReference
+FNL:8,1404,1408
+FNA:8,0,CeedQFunctionAssemblyDataSetReuse
+FNL:9,1420,1423
+FNA:9,0,CeedQFunctionAssemblyDataSetUpdateNeeded
+FNL:10,1435,1438
+FNA:10,0,CeedQFunctionAssemblyDataIsUpdateNeeded
+FNL:11,1455,1460
+FNA:11,0,CeedQFunctionAssemblyDataReferenceCopy
+FNL:12,1472,1475
+FNA:12,0,CeedQFunctionAssemblyDataIsSetup
+FNL:13,1488,1494
+FNA:13,0,CeedQFunctionAssemblyDataSetObjects
+FNL:14,1507,1513
+FNA:14,0,CeedQFunctionAssemblyDataGetObjects
+FNL:15,1524,1535
+FNA:15,384,CeedQFunctionAssemblyDataDestroy
+FNL:16,1547,1556
+FNA:16,0,CeedOperatorGetOperatorAssemblyData
+FNL:17,1576,1721
+FNA:17,0,CeedOperatorAssemblyDataCreate
+FNL:18,1744,1758
+FNA:18,0,CeedOperatorAssemblyDataGetEvalModes
+FNL:19,1777,1885
+FNA:19,0,CeedOperatorAssemblyDataGetBases
+FNL:20,1902,1910
+FNA:20,0,CeedOperatorAssemblyDataGetElemRestrictions
+FNL:21,1921,1956
+FNA:21,384,CeedOperatorAssemblyDataDestroy
+FNL:22,1968,1995
+FNA:22,0,CeedOperatorGetFallback
+FNL:23,2007,2010
+FNA:23,0,CeedOperatorGetFallbackParent
+FNL:24,2022,2027
+FNA:24,0,CeedOperatorGetFallbackParentCeed
+FNL:25,2057,2073
+FNA:25,0,CeedOperatorLinearAssembleQFunction
+FNL:26,2094,2096
+FNA:26,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdate
+FNL:27,2115,2162
+FNA:27,0,CeedOperatorLinearAssembleDiagonal
+FNL:28,216,387
+FNA:28,0,CeedOperatorLinearAssembleAddDiagonalSingle_Mesh
+FNL:29,2181,2221
+FNA:29,0,CeedOperatorLinearAssembleAddDiagonal
+FNL:30,2243,2325
+FNA:30,0,CeedOperatorLinearAssemblePointBlockDiagonalSymbolic
+FNL:31,2346,2388
+FNA:31,0,CeedOperatorLinearAssemblePointBlockDiagonal
+FNL:32,2409,2449
+FNA:32,0,CeedOperatorLinearAssembleAddPointBlockDiagonal
+FNL:33,2471,2516
+FNA:33,0,CeedOperatorLinearAssembleSymbolic
+FNL:34,2536,2588
+FNA:34,0,CeedOperatorLinearAssemble
+FNL:35,2604,2662
+FNA:35,0,CeedOperatorCompositeGetMultiplicity
+FNL:36,2681,2700
+FNA:36,0,CeedOperatorMultigridLevelCreate
+FNL:37,2720,2767
+FNA:37,0,CeedOperatorMultigridLevelCreateTensorH1
+FNL:38,2787,2832
+FNA:38,0,CeedOperatorMultigridLevelCreateH1
+FNL:39,2853,3076
+FNA:39,0,CeedOperatorCreateFDMElementInverse
+FNL:40,37,94
+FNA:40,0,CeedQFunctionCreateFallback
+FNL:41,401,409
+FNA:41,0,CeedOperatorLinearAssembleAddDiagonalSingle
+FNL:42,423,438
+FNA:42,0,CeedOperatorLinearAssembleAddDiagonalComposite
+FNL:43,454,552
+FNA:43,0,CeedOperatorAssembleSymbolicSingle
+FNL:44,574,632
+FNA:44,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core
+FNL:45,653,656
+FNA:45,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback
+FNL:46,671,916
+FNA:46,0,CeedOperatorAssembleSingle
+FNL:47,928,957
+FNA:47,0,CeedOperatorAssemblyCountEntriesSingle
+FNL:48,969,993
+FNA:48,0,CeedOperatorLinearAssembleGetNumEntries
+FNF:49
+FNH:2
+DA:37,0
+DA:38,0
+DA:43,0
+DA:45,0
+DA:47,0
+DA:48,0
+DA:50,0
+DA:51,0
+DA:52,0
+DA:53,0
+DA:54,0
+DA:55,0
+DA:57,0
+DA:64,0
+DA:65,0
+DA:66,0
+DA:71,0
+DA:72,0
+DA:73,0
+DA:75,0
+DA:76,0
+DA:81,0
+DA:82,0
+DA:84,0
+DA:89,0
+DA:90,0
+DA:92,0
+DA:93,0
+DA:105,0
+DA:111,0
+DA:114,0
+DA:115,0
+DA:116,0
+DA:117,0
+DA:119,0
+DA:122,0
+DA:123,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:130,0
+DA:133,0
+DA:134,0
+DA:137,0
+DA:139,0
+DA:142,0
+DA:143,0
+DA:144,0
+DA:145,0
+DA:146,0
+DA:150,0
+DA:151,0
+DA:152,0
+DA:153,0
+DA:154,0
+DA:156,0
+DA:158,0
+DA:159,0
+DA:165,0
+DA:166,0
+DA:167,0
+DA:168,0
+DA:169,0
+DA:171,0
+DA:177,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:181,0
+DA:186,0
+DA:187,0
+DA:190,0
+DA:191,0
+DA:192,0
+DA:194,0
+DA:195,0
+DA:198,0
+DA:199,0
+DA:200,0
+DA:201,0
+DA:216,0
+DA:220,0
+DA:221,0
+DA:226,0
+DA:227,0
+DA:229,0
+DA:230,0
+DA:231,0
+DA:232,0
+DA:242,0
+DA:243,0
+DA:246,0
+DA:247,0
+DA:250,0
+DA:252,0
+DA:253,0
+DA:257,0
+DA:258,0
+DA:259,0
+DA:260,0
+DA:261,0
+DA:264,0
+DA:265,0
+DA:268,0
+DA:269,0
+DA:270,0
+DA:271,0
+DA:274,0
+DA:275,0
+DA:278,0
+DA:282,0
+DA:283,0
+DA:285,0
+DA:289,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:296,0
+DA:297,0
+DA:298,0
+DA:301,0
+DA:302,0
+DA:304,0
+DA:305,0
+DA:307,0
+DA:308,0
+DA:309,0
+DA:314,0
+DA:316,0
+DA:317,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:322,0
+DA:324,0
+DA:325,0
+DA:326,0
+DA:327,0
+DA:328,0
+DA:330,0
+DA:332,0
+DA:333,0
+DA:335,0
+DA:336,0
+DA:337,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:344,0
+DA:346,0
+DA:347,0
+DA:349,0
+DA:350,0
+DA:351,0
+DA:352,0
+DA:354,0
+DA:355,0
+DA:356,0
+DA:361,0
+DA:362,0
+DA:363,0
+DA:365,0
+DA:366,0
+DA:374,0
+DA:377,0
+DA:380,0
+DA:381,0
+DA:382,0
+DA:384,0
+DA:385,0
+DA:386,0
+DA:401,0
+DA:405,0
+DA:406,0
+DA:407,0
+DA:408,0
+DA:423,0
+DA:428,0
+DA:429,0
+DA:430,0
+DA:431,0
+DA:432,0
+DA:434,0
+DA:437,0
+DA:454,0
+DA:457,0
+DA:465,0
+DA:466,0
+DA:467,0
+DA:469,0
+DA:470,0
+DA:471,0
+DA:472,0
+DA:473,0
+DA:474,0
+DA:477,0
+DA:478,0
+DA:479,0
+DA:480,0
+DA:481,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:485,0
+DA:486,0
+DA:487,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:495,0
+DA:496,0
+DA:497,0
+DA:500,0
+DA:501,0
+DA:502,0
+DA:503,0
+DA:504,0
+DA:505,0
+DA:506,0
+DA:507,0
+DA:508,0
+DA:509,0
+DA:510,0
+DA:512,0
+DA:513,0
+DA:514,0
+DA:515,0
+DA:516,0
+DA:517,0
+DA:518,0
+DA:520,0
+DA:523,0
+DA:524,0
+DA:525,0
+DA:526,0
+DA:527,0
+DA:528,0
+DA:529,0
+DA:530,0
+DA:531,0
+DA:533,0
+DA:534,0
+DA:535,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:544,0
+DA:545,0
+DA:546,0
+DA:548,0
+DA:549,0
+DA:550,0
+DA:551,0
+DA:574,0
+DA:576,0
+DA:577,0
+DA:578,0
+DA:580,0
+DA:583,0
+DA:584,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:589,0
+DA:591,0
+DA:592,0
+DA:596,0
+DA:600,0
+DA:601,0
+DA:603,0
+DA:604,0
+DA:605,0
+DA:608,0
+DA:609,0
+DA:610,0
+DA:612,0
+DA:613,0
+DA:615,0
+DA:618,0
+DA:619,0
+DA:620,0
+DA:621,0
+DA:626,0
+DA:627,0
+DA:628,0
+DA:629,0
+DA:631,0
+DA:653,0
+DA:655,0
+DA:671,0
+DA:674,0
+DA:675,0
+DA:679,0
+DA:681,0
+DA:682,0
+DA:685,0
+DA:687,0
+DA:688,0
+DA:693,0
+DA:694,0
+DA:695,0
+DA:696,0
+DA:697,0
+DA:701,0
+DA:702,0
+DA:708,0
+DA:709,0
+DA:711,0
+DA:712,0
+DA:713,0
+DA:714,0
+DA:719,0
+DA:726,0
+DA:727,0
+DA:730,0
+DA:731,0
+DA:734,0
+DA:736,0
+DA:739,0
+DA:740,0
+DA:741,0
+DA:742,0
+DA:743,0
+DA:744,0
+DA:746,0
+DA:747,0
+DA:748,0
+DA:749,0
+DA:750,0
+DA:752,0
+DA:753,0
+DA:754,0
+DA:755,0
+DA:756,0
+DA:759,0
+DA:760,0
+DA:761,0
+DA:765,0
+DA:766,0
+DA:767,0
+DA:768,0
+DA:769,0
+DA:774,0
+DA:775,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:781,0
+DA:782,0
+DA:783,0
+DA:784,0
+DA:786,0
+DA:787,0
+DA:789,0
+DA:794,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:799,0
+DA:801,0
+DA:802,0
+DA:803,0
+DA:804,0
+DA:806,0
+DA:807,0
+DA:808,0
+DA:809,0
+DA:810,0
+DA:812,0
+DA:813,0
+DA:814,0
+DA:815,0
+DA:817,0
+DA:819,0
+DA:825,0
+DA:826,0
+DA:831,0
+DA:832,0
+DA:833,0
+DA:837,0
+DA:838,0
+DA:840,0
+DA:841,0
+DA:843,0
+DA:844,0
+DA:847,0
+DA:848,0
+DA:851,0
+DA:852,0
+DA:853,0
+DA:854,0
+DA:855,0
+DA:856,0
+DA:860,0
+DA:861,0
+DA:863,0
+DA:864,0
+DA:865,0
+DA:868,0
+DA:869,0
+DA:872,0
+DA:873,0
+DA:874,0
+DA:875,0
+DA:876,0
+DA:877,0
+DA:883,0
+DA:884,0
+DA:885,0
+DA:886,0
+DA:892,0
+DA:893,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:901,0
+DA:902,0
+DA:904,0
+DA:905,0
+DA:906,0
+DA:907,0
+DA:908,0
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:928,0
+DA:933,0
+DA:934,0
+DA:936,0
+DA:937,0
+DA:938,0
+DA:939,0
+DA:940,0
+DA:941,0
+DA:942,0
+DA:946,0
+DA:947,0
+DA:949,0
+DA:950,0
+DA:951,0
+DA:953,0
+DA:954,0
+DA:955,0
+DA:956,0
+DA:969,0
+DA:972,0
+DA:973,0
+DA:975,0
+DA:979,0
+DA:980,0
+DA:982,0
+DA:983,0
+DA:986,0
+DA:987,0
+DA:990,0
+DA:992,0
+DA:1011,0
+DA:1017,0
+DA:1018,0
+DA:1021,0
+DA:1024,0
+DA:1025,0
+DA:1031,0
+DA:1032,0
+DA:1036,0
+DA:1037,0
+DA:1038,0
+DA:1039,0
+DA:1040,0
+DA:1042,0
+DA:1045,0
+DA:1047,0
+DA:1050,0
+DA:1051,0
+DA:1053,0
+DA:1054,0
+DA:1055,0
+DA:1056,0
+DA:1057,0
+DA:1058,0
+DA:1060,0
+DA:1061,0
+DA:1063,0
+DA:1064,0
+DA:1065,0
+DA:1066,0
+DA:1069,0
+DA:1072,0
+DA:1073,0
+DA:1075,0
+DA:1076,0
+DA:1077,0
+DA:1078,0
+DA:1079,0
+DA:1080,0
+DA:1082,0
+DA:1083,0
+DA:1085,0
+DA:1086,0
+DA:1087,0
+DA:1088,0
+DA:1094,0
+DA:1095,0
+DA:1099,0
+DA:1103,0
+DA:1104,0
+DA:1106,0
+DA:1107,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1111,0
+DA:1112,0
+DA:1113,0
+DA:1114,0
+DA:1118,0
+DA:1119,0
+DA:1120,0
+DA:1123,0
+DA:1127,0
+DA:1130,0
+DA:1135,0
+DA:1136,0
+DA:1137,0
+DA:1138,0
+DA:1139,0
+DA:1140,0
+DA:1141,0
+DA:1142,0
+DA:1143,0
+DA:1144,0
+DA:1145,0
+DA:1147,0
+DA:1148,0
+DA:1149,0
+DA:1150,0
+DA:1155,0
+DA:1156,0
+DA:1157,0
+DA:1158,0
+DA:1161,0
+DA:1164,0
+DA:1168,0
+DA:1173,0
+DA:1174,0
+DA:1175,0
+DA:1176,0
+DA:1177,0
+DA:1178,0
+DA:1179,0
+DA:1180,0
+DA:1181,0
+DA:1182,0
+DA:1183,0
+DA:1185,0
+DA:1186,0
+DA:1187,0
+DA:1188,0
+DA:1193,0
+DA:1194,0
+DA:1195,0
+DA:1196,0
+DA:1199,0
+DA:1202,0
+DA:1206,0
+DA:1209,0
+DA:1210,0
+DA:1211,0
+DA:1212,0
+DA:1213,0
+DA:1214,0
+DA:1234,0
+DA:1236,0
+DA:1237,0
+DA:1238,0
+DA:1239,0
+DA:1240,0
+DA:1244,0
+DA:1245,0
+DA:1246,0
+DA:1248,0
+DA:1249,0
+DA:1252,0
+DA:1253,0
+DA:1254,0
+DA:1276,0
+DA:1277,0
+DA:1278,0
+DA:1279,0
+DA:1280,0
+DA:1281,0
+DA:1282,0
+DA:1283,0
+DA:1284,0
+DA:1285,0
+DA:1286,0
+DA:1287,0
+DA:1288,0
+DA:1289,0
+DA:1290,0
+DA:1291,0
+DA:1292,0
+DA:1293,0
+DA:1294,0
+DA:1296,0
+DA:1297,0
+DA:1310,0
+DA:1316,0
+DA:1317,0
+DA:1320,0
+DA:1321,0
+DA:1322,0
+DA:1323,0
+DA:1324,0
+DA:1325,0
+DA:1326,0
+DA:1327,0
+DA:1328,0
+DA:1329,0
+DA:1333,0
+DA:1337,0
+DA:1338,0
+DA:1339,0
+DA:1352,0
+DA:1353,0
+DA:1356,0
+DA:1357,0
+DA:1359,0
+DA:1360,0
+DA:1373,0
+DA:1374,0
+DA:1375,0
+DA:1376,0
+DA:1377,0
+DA:1389,0
+DA:1390,0
+DA:1391,0
+DA:1404,0
+DA:1405,0
+DA:1406,0
+DA:1407,0
+DA:1420,0
+DA:1421,0
+DA:1422,0
+DA:1435,0
+DA:1436,0
+DA:1437,0
+DA:1455,0
+DA:1456,0
+DA:1457,0
+DA:1458,0
+DA:1459,0
+DA:1472,0
+DA:1473,0
+DA:1474,0
+DA:1488,0
+DA:1489,0
+DA:1490,0
+DA:1492,0
+DA:1493,0
+DA:1507,0
+DA:1508,0
+DA:1510,0
+DA:1511,0
+DA:1512,0
+DA:1524,384
+DA:1525,384
+DA:1526,384
+DA:1527,384
+DA:1529,0
+DA:1530,0
+DA:1531,0
+DA:1533,0
+DA:1534,0
+DA:1547,0
+DA:1548,0
+DA:1551,0
+DA:1552,0
+DA:1554,0
+DA:1555,0
+DA:1576,0
+DA:1577,0
+DA:1578,0
+DA:1579,0
+DA:1580,0
+DA:1586,0
+DA:1587,0
+DA:1590,0
+DA:1591,0
+DA:1594,0
+DA:1597,0
+DA:1598,0
+DA:1599,0
+DA:1602,0
+DA:1603,0
+DA:1604,0
+DA:1606,0
+DA:1608,0
+DA:1609,0
+DA:1610,0
+DA:1611,0
+DA:1612,0
+DA:1613,0
+DA:1615,0
+DA:1618,0
+DA:1619,0
+DA:1620,0
+DA:1621,0
+DA:1622,0
+DA:1623,0
+DA:1624,0
+DA:1625,0
+DA:1626,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1630,0
+DA:1631,0
+DA:1632,0
+DA:1633,0
+DA:1634,0
+DA:1635,0
+DA:1637,0
+DA:1639,0
+DA:1640,0
+DA:1641,0
+DA:1642,0
+DA:1643,0
+DA:1644,0
+DA:1646,0
+DA:1648,0
+DA:1650,0
+DA:1654,0
+DA:1655,0
+DA:1656,0
+DA:1657,0
+DA:1660,0
+DA:1661,0
+DA:1662,0
+DA:1664,0
+DA:1666,0
+DA:1667,0
+DA:1668,0
+DA:1669,0
+DA:1670,0
+DA:1671,0
+DA:1673,0
+DA:1676,0
+DA:1677,0
+DA:1678,0
+DA:1679,0
+DA:1680,0
+DA:1681,0
+DA:1682,0
+DA:1683,0
+DA:1684,0
+DA:1685,0
+DA:1686,0
+DA:1687,0
+DA:1688,0
+DA:1689,0
+DA:1690,0
+DA:1691,0
+DA:1692,0
+DA:1693,0
+DA:1695,0
+DA:1697,0
+DA:1698,0
+DA:1699,0
+DA:1700,0
+DA:1701,0
+DA:1702,0
+DA:1704,0
+DA:1706,0
+DA:1708,0
+DA:1710,0
+DA:1711,0
+DA:1712,0
+DA:1713,0
+DA:1714,0
+DA:1715,0
+DA:1716,0
+DA:1717,0
+DA:1718,0
+DA:1719,0
+DA:1720,0
+DA:1744,0
+DA:1748,0
+DA:1749,0
+DA:1750,0
+DA:1751,0
+DA:1752,0
+DA:1753,0
+DA:1754,0
+DA:1755,0
+DA:1756,0
+DA:1757,0
+DA:1777,0
+DA:1781,0
+DA:1784,0
+DA:1785,0
+DA:1786,0
+DA:1787,0
+DA:1789,0
+DA:1791,0
+DA:1792,0
+DA:1794,0
+DA:1795,0
+DA:1797,0
+DA:1798,0
+DA:1799,0
+DA:1800,0
+DA:1804,0
+DA:1805,0
+DA:1806,0
+DA:1807,0
+DA:1809,0
+DA:1810,0
+DA:1811,0
+DA:1813,0
+DA:1814,0
+DA:1815,0
+DA:1816,0
+DA:1817,0
+DA:1819,0
+DA:1820,0
+DA:1824,0
+DA:1825,0
+DA:1829,0
+DA:1832,0
+DA:1833,0
+DA:1834,0
+DA:1835,0
+DA:1837,0
+DA:1839,0
+DA:1840,0
+DA:1842,0
+DA:1843,0
+DA:1845,0
+DA:1846,0
+DA:1847,0
+DA:1848,0
+DA:1852,0
+DA:1853,0
+DA:1854,0
+DA:1855,0
+DA:1857,0
+DA:1858,0
+DA:1859,0
+DA:1861,0
+DA:1862,0
+DA:1863,0
+DA:1864,0
+DA:1865,0
+DA:1867,0
+DA:1868,0
+DA:1872,0
+DA:1873,0
+DA:1878,0
+DA:1879,0
+DA:1880,0
+DA:1881,0
+DA:1882,0
+DA:1883,0
+DA:1884,0
+DA:1902,0
+DA:1905,0
+DA:1906,0
+DA:1907,0
+DA:1908,0
+DA:1909,0
+DA:1921,384
+DA:1922,384
+DA:1923,384
+DA:1924,384
+DA:1926,0
+DA:1927,0
+DA:1928,0
+DA:1929,0
+DA:1930,0
+DA:1931,0
+DA:1932,0
+DA:1934,0
+DA:1935,0
+DA:1936,0
+DA:1937,0
+DA:1938,0
+DA:1939,0
+DA:1941,0
+DA:1942,0
+DA:1943,0
+DA:1944,0
+DA:1945,0
+DA:1946,0
+DA:1947,0
+DA:1948,0
+DA:1949,0
+DA:1950,0
+DA:1951,0
+DA:1952,0
+DA:1954,0
+DA:1955,0
+DA:1968,0
+DA:1970,0
+DA:1971,0
+DA:1975,0
+DA:1976,0
+DA:1977,0
+DA:1981,0
+DA:1982,0
+DA:1983,0
+DA:1984,0
+DA:1986,0
+DA:1987,0
+DA:1989,0
+DA:1991,0
+DA:1993,0
+DA:1994,0
+DA:2007,0
+DA:2008,0
+DA:2009,0
+DA:2022,0
+DA:2023,0
+DA:2024,0
+DA:2025,0
+DA:2026,0
+DA:2057,0
+DA:2058,0
+DA:2060,0
+DA:2062,0
+DA:2067,0
+DA:2068,0
+DA:2069,0
+DA:2070,0
+DA:2072,0
+DA:2094,0
+DA:2095,0
+DA:2115,0
+DA:2117,0
+DA:2119,0
+DA:2120,0
+DA:2122,0
+DA:2123,0
+DA:2126,0
+DA:2127,0
+DA:2129,0
+DA:2130,0
+DA:2133,0
+DA:2135,0
+DA:2136,0
+DA:2137,0
+DA:2139,0
+DA:2140,0
+DA:2141,0
+DA:2142,0
+DA:2144,0
+DA:2145,0
+DA:2146,0
+DA:2151,0
+DA:2152,0
+DA:2153,0
+DA:2154,0
+DA:2155,0
+DA:2159,0
+DA:2160,0
+DA:2161,0
+DA:2181,0
+DA:2183,0
+DA:2185,0
+DA:2186,0
+DA:2188,0
+DA:2189,0
+DA:2192,0
+DA:2193,0
+DA:2195,0
+DA:2196,0
+DA:2199,0
+DA:2201,0
+DA:2202,0
+DA:2203,0
+DA:2205,0
+DA:2206,0
+DA:2211,0
+DA:2212,0
+DA:2213,0
+DA:2214,0
+DA:2215,0
+DA:2219,0
+DA:2220,0
+DA:2243,0
+DA:2248,0
+DA:2250,0
+DA:2251,0
+DA:2252,0
+DA:2254,0
+DA:2255,0
+DA:2256,0
+DA:2258,0
+DA:2259,0
+DA:2269,0
+DA:2270,0
+DA:2271,0
+DA:2272,0
+DA:2275,0
+DA:2276,0
+DA:2277,0
+DA:2278,0
+DA:2281,0
+DA:2282,0
+DA:2284,0
+DA:2285,0
+DA:2292,0
+DA:2293,0
+DA:2294,0
+DA:2296,0
+DA:2301,0
+DA:2302,0
+DA:2303,0
+DA:2304,0
+DA:2305,0
+DA:2307,0
+DA:2308,0
+DA:2310,0
+DA:2311,0
+DA:2312,0
+DA:2313,0
+DA:2314,0
+DA:2319,0
+DA:2320,0
+DA:2321,0
+DA:2322,0
+DA:2324,0
+DA:2346,0
+DA:2348,0
+DA:2350,0
+DA:2351,0
+DA:2353,0
+DA:2354,0
+DA:2357,0
+DA:2358,0
+DA:2360,0
+DA:2361,0
+DA:2364,0
+DA:2366,0
+DA:2367,0
+DA:2368,0
+DA:2370,0
+DA:2371,0
+DA:2372,0
+DA:2377,0
+DA:2378,0
+DA:2379,0
+DA:2380,0
+DA:2381,0
+DA:2385,0
+DA:2386,0
+DA:2387,0
+DA:2409,0
+DA:2411,0
+DA:2413,0
+DA:2414,0
+DA:2416,0
+DA:2417,0
+DA:2420,0
+DA:2421,0
+DA:2423,0
+DA:2424,0
+DA:2427,0
+DA:2429,0
+DA:2430,0
+DA:2435,0
+DA:2436,0
+DA:2437,0
+DA:2438,0
+DA:2439,0
+DA:2443,0
+DA:2444,0
+DA:2446,0
+DA:2448,0
+DA:2471,0
+DA:2473,0
+DA:2477,0
+DA:2478,0
+DA:2480,0
+DA:2482,0
+DA:2483,0
+DA:2488,0
+DA:2489,0
+DA:2490,0
+DA:2491,0
+DA:2492,0
+DA:2499,0
+DA:2500,0
+DA:2501,0
+DA:2504,0
+DA:2505,0
+DA:2506,0
+DA:2507,0
+DA:2508,0
+DA:2509,0
+DA:2510,0
+DA:2513,0
+DA:2515,0
+DA:2536,0
+DA:2538,0
+DA:2539,0
+DA:2542,0
+DA:2543,0
+DA:2546,0
+DA:2547,0
+DA:2549,0
+DA:2550,0
+DA:2553,0
+DA:2555,0
+DA:2556,0
+DA:2557,0
+DA:2559,0
+DA:2560,0
+DA:2561,0
+DA:2562,0
+DA:2563,0
+DA:2564,0
+DA:2565,0
+DA:2567,0
+DA:2568,0
+DA:2569,0
+DA:2570,0
+DA:2571,0
+DA:2576,0
+DA:2577,0
+DA:2578,0
+DA:2579,0
+DA:2580,0
+DA:2585,0
+DA:2586,0
+DA:2587,0
+DA:2604,0
+DA:2613,0
+DA:2616,0
+DA:2619,0
+DA:2620,0
+DA:2621,0
+DA:2624,0
+DA:2625,0
+DA:2626,0
+DA:2627,0
+DA:2628,0
+DA:2629,0
+DA:2632,0
+DA:2637,0
+DA:2638,0
+DA:2642,0
+DA:2643,0
+DA:2644,0
+DA:2645,0
+DA:2646,0
+DA:2647,0
+DA:2648,0
+DA:2649,0
+DA:2651,0
+DA:2652,0
+DA:2654,0
+DA:2655,0
+DA:2656,0
+DA:2657,0
+DA:2659,0
+DA:2660,0
+DA:2661,0
+DA:2681,0
+DA:2683,0
+DA:2685,0
+DA:2688,0
+DA:2691,0
+DA:2692,0
+DA:2693,0
+DA:2697,0
+DA:2699,0
+DA:2720,0
+DA:2725,0
+DA:2727,0
+DA:2728,0
+DA:2731,0
+DA:2732,0
+DA:2733,0
+DA:2734,0
+DA:2740,0
+DA:2745,0
+DA:2747,0
+DA:2748,0
+DA:2749,0
+DA:2750,0
+DA:2751,0
+DA:2752,0
+DA:2753,0
+DA:2754,0
+DA:2755,0
+DA:2756,0
+DA:2757,0
+DA:2758,0
+DA:2759,0
+DA:2763,0
+DA:2765,0
+DA:2766,0
+DA:2787,0
+DA:2792,0
+DA:2794,0
+DA:2795,0
+DA:2798,0
+DA:2799,0
+DA:2800,0
+DA:2801,0
+DA:2804,0
+DA:2810,0
+DA:2812,0
+DA:2813,0
+DA:2814,0
+DA:2815,0
+DA:2816,0
+DA:2817,0
+DA:2818,0
+DA:2819,0
+DA:2820,0
+DA:2821,0
+DA:2822,0
+DA:2823,0
+DA:2824,0
+DA:2828,0
+DA:2830,0
+DA:2831,0
+DA:2853,0
+DA:2855,0
+DA:2856,0
+DA:2860,0
+DA:2861,0
+DA:2867,0
+DA:2869,0
+DA:2871,0
+DA:2872,0
+DA:2877,0
+DA:2878,0
+DA:2879,0
+DA:2880,0
+DA:2881,0
+DA:2886,0
+DA:2887,0
+DA:2888,0
+DA:2891,0
+DA:2892,0
+DA:2893,0
+DA:2896,0
+DA:2897,0
+DA:2900,0
+DA:2901,0
+DA:2902,0
+DA:2903,0
+DA:2904,0
+DA:2906,0
+DA:2908,0
+DA:2909,0
+DA:2910,0
+DA:2911,0
+DA:2912,0
+DA:2913,0
+DA:2914,0
+DA:2915,0
+DA:2918,0
+DA:2919,0
+DA:2920,0
+DA:2921,0
+DA:2922,0
+DA:2923,0
+DA:2924,0
+DA:2926,0
+DA:2927,0
+DA:2928,0
+DA:2929,0
+DA:2932,0
+DA:2933,0
+DA:2934,0
+DA:2935,0
+DA:2936,0
+DA:2938,0
+DA:2941,0
+DA:2942,0
+DA:2944,0
+DA:2945,0
+DA:2948,0
+DA:2949,0
+DA:2950,0
+DA:2951,0
+DA:2954,0
+DA:2955,0
+DA:2956,0
+DA:2957,0
+DA:2958,0
+DA:2959,0
+DA:2961,0
+DA:2962,0
+DA:2964,0
+DA:2965,0
+DA:2966,0
+DA:2967,0
+DA:2968,0
+DA:2972,0
+DA:2973,0
+DA:2975,0
+DA:2978,0
+DA:2979,0
+DA:2980,0
+DA:2981,0
+DA:2988,0
+DA:2989,0
+DA:2990,0
+DA:2991,0
+DA:2992,0
+DA:2993,0
+DA:2994,0
+DA:2995,0
+DA:2996,0
+DA:2999,0
+DA:3002,0
+DA:3003,0
+DA:3004,0
+DA:3005,0
+DA:3006,0
+DA:3007,0
+DA:3008,0
+DA:3012,0
+DA:3013,0
+DA:3014,0
+DA:3022,0
+DA:3023,0
+DA:3024,0
+DA:3025,0
+DA:3026,0
+DA:3027,0
+DA:3028,0
+DA:3029,0
+DA:3030,0
+DA:3035,0
+DA:3036,0
+DA:3041,0
+DA:3042,0
+DA:3043,0
+DA:3044,0
+DA:3045,0
+DA:3051,0
+DA:3052,0
+DA:3053,0
+DA:3054,0
+DA:3056,0
+DA:3057,0
+DA:3060,0
+DA:3061,0
+DA:3062,0
+DA:3063,0
+DA:3066,0
+DA:3067,0
+DA:3068,0
+DA:3069,0
+DA:3070,0
+DA:3071,0
+DA:3072,0
+DA:3073,0
+DA:3074,0
+DA:3075,0
+LF:1371
+LH:8
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunction-register.c
+FNL:0,30,44
+FNA:0,192,CeedQFunctionRegisterAll
+FNF:1
+FNH:1
+DA:30,192
+DA:31,192
+DA:34,192
+DA:35,96
+DA:40,96
+DA:43,192
+LF:6
+LH:6
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunction.c
+FNL:0,1023,1026
+FNA:0,0,CeedQFunctionSetContextWritable
+FNL:1,1036,1040
+FNA:1,192,CeedQFunctionSetUserFlopsEstimate
+FNL:2,1052,1055
+FNA:2,0,CeedQFunctionSetNumViewTabs
+FNL:3,106,112
+FNA:3,1152,CeedQFunctionFieldSet
+FNL:4,1067,1070
+FNA:4,0,CeedQFunctionGetNumViewTabs
+FNL:5,1082,1108
+FNA:5,0,CeedQFunctionView
+FNL:6,1120,1123
+FNA:6,480,CeedQFunctionGetCeed
+FNL:7,1134,1134
+FNA:7,0,CeedQFunctionReturnCeed
+FNL:8,1150,1160
+FNA:8,3072,CeedQFunctionApply
+FNL:9,1171,1202
+FNA:9,10368,CeedQFunctionDestroy
+FNL:10,127,143
+FNA:10,0,CeedQFunctionFieldView
+FNL:11,155,158
+FNA:11,0,CeedQFunctionView_Object
+FNL:12,169,172
+FNA:12,0,CeedQFunctionDestroy_Object
+FNL:13,184,187
+FNA:13,96,CeedQFunctionSetFortranStatus
+FNL:14,207,210
+FNA:14,3072,CeedQFunctionGetVectorLength
+FNL:15,223,227
+FNA:15,3072,CeedQFunctionGetNumArgs
+FNL:16,240,247
+FNA:16,0,CeedQFunctionGetName
+FNL:17,259,277
+FNA:17,768,CeedQFunctionGetKernelName
+FNL:18,289,318
+FNA:18,768,CeedQFunctionGetSourcePath
+FNL:19,338,353
+FNA:19,0,CeedQFunctionLoadSourceToBuffer
+FNL:20,365,368
+FNA:20,3072,CeedQFunctionGetUserFunction
+FNL:21,382,386
+FNA:21,6192,CeedQFunctionGetContext
+FNL:22,400,417
+FNA:22,3072,CeedQFunctionGetContextData
+FNL:23,429,444
+FNA:23,3072,CeedQFunctionRestoreContextData
+FNL:24,457,472
+FNA:24,0,CeedQFunctionGetInnerContext
+FNL:25,486,502
+FNA:25,0,CeedQFunctionGetInnerContextData
+FNL:26,514,529
+FNA:26,0,CeedQFunctionRestoreInnerContextData
+FNL:27,541,544
+FNA:27,384,CeedQFunctionIsIdentity
+FNL:28,556,559
+FNA:28,2304,CeedQFunctionIsContextWritable
+FNL:29,571,574
+FNA:29,3456,CeedQFunctionGetData
+FNL:30,586,589
+FNA:30,384,CeedQFunctionSetData
+FNL:31,601,604
+FNA:31,1152,CeedQFunctionIsImmutable
+FNL:32,615,618
+FNA:32,9216,CeedQFunctionSetImmutable
+FNL:33,629,632
+FNA:33,4608,CeedQFunctionReference
+FNL:34,64,87
+FNA:34,1536,CeedQFunctionRegister
+FNL:35,642,645
+FNA:35,0,CeedQFunctionGetFlopsEstimate
+FNL:36,677,711
+FNA:36,720,CeedQFunctionCreateInterior
+FNL:37,724,753
+FNA:37,192,CeedQFunctionCreateInteriorByName
+FNL:38,772,787
+FNA:38,0,CeedQFunctionCreateIdentity
+FNL:39,804,809
+FNA:39,4608,CeedQFunctionReferenceCopy
+FNL:40,834,851
+FNA:40,768,CeedQFunctionAddInput
+FNL:41,876,894
+FNA:41,384,CeedQFunctionAddOutput
+FNL:42,911,919
+FNA:42,5760,CeedQFunctionGetFields
+FNL:43,931,934
+FNA:43,4224,CeedQFunctionFieldGetName
+FNL:44,946,949
+FNA:44,9792,CeedQFunctionFieldGetSize
+FNL:45,961,964
+FNA:45,14784,CeedQFunctionFieldGetEvalMode
+FNL:46,980,985
+FNA:46,1152,CeedQFunctionFieldGetData
+FNL:47,997,1002
+FNA:47,144,CeedQFunctionSetContext
+FNF:48
+FNH:33
+DA:64,1536
+DA:67,1536
+DA:69,1536
+DA:70,1536
+DA:72,1536
+DA:73,1536
+DA:74,1536
+DA:75,1536
+DA:76,1536
+DA:77,1536
+DA:78,1536
+DA:79,1536
+DA:80,1536
+DA:82,0
+DA:85,1536
+DA:86,1536
+DA:106,1152
+DA:107,1152
+DA:108,1152
+DA:109,1152
+DA:110,1152
+DA:111,1152
+DA:127,0
+DA:128,0
+DA:133,0
+DA:134,0
+DA:141,0
+DA:142,0
+DA:155,0
+DA:156,0
+DA:157,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:184,96
+DA:185,96
+DA:186,96
+DA:207,3072
+DA:208,3072
+DA:209,3072
+DA:223,3072
+DA:224,3072
+DA:225,3072
+DA:226,3072
+DA:240,0
+DA:241,0
+DA:242,0
+DA:244,0
+DA:246,0
+DA:259,768
+DA:260,768
+DA:263,96
+DA:264,96
+DA:265,96
+DA:267,96
+DA:268,96
+DA:270,0
+DA:272,96
+DA:275,768
+DA:276,768
+DA:289,768
+DA:290,768
+DA:295,96
+DA:296,96
+DA:298,96
+DA:299,96
+DA:300,96
+DA:301,48
+DA:303,48
+DA:305,96
+DA:307,96
+DA:309,96
+DA:310,96
+DA:311,96
+DA:313,96
+DA:316,768
+DA:317,768
+DA:338,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:345,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:350,0
+DA:352,0
+DA:365,3072
+DA:366,3072
+DA:367,3072
+DA:382,6192
+DA:383,6192
+DA:384,6192
+DA:385,6192
+DA:400,3072
+DA:404,3072
+DA:405,3072
+DA:406,1152
+DA:407,1152
+DA:408,1152
+DA:410,0
+DA:413,1920
+DA:415,3072
+DA:416,3072
+DA:429,3072
+DA:433,3072
+DA:434,3072
+DA:435,1152
+DA:436,1152
+DA:437,1152
+DA:439,0
+DA:442,3072
+DA:443,3072
+DA:457,0
+DA:460,0
+DA:461,0
+DA:462,0
+DA:464,0
+DA:465,0
+DA:466,0
+DA:468,0
+DA:470,0
+DA:471,0
+DA:486,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:493,0
+DA:494,0
+DA:496,0
+DA:499,0
+DA:501,0
+DA:514,0
+DA:518,0
+DA:519,0
+DA:520,0
+DA:521,0
+DA:522,0
+DA:524,0
+DA:527,0
+DA:528,0
+DA:541,384
+DA:542,384
+DA:543,384
+DA:556,2304
+DA:557,2304
+DA:558,2304
+DA:571,3456
+DA:572,3456
+DA:573,3456
+DA:586,384
+DA:587,384
+DA:588,384
+DA:601,1152
+DA:602,1152
+DA:603,1152
+DA:615,9216
+DA:616,9216
+DA:617,9216
+DA:629,4608
+DA:630,4608
+DA:631,4608
+DA:642,0
+DA:643,0
+DA:644,0
+DA:677,720
+DA:680,720
+DA:683,336
+DA:684,336
+DA:685,336
+DA:686,336
+DA:687,336
+DA:690,384
+DA:693,384
+DA:694,384
+DA:695,384
+DA:696,384
+DA:697,384
+DA:698,384
+DA:699,384
+DA:700,384
+DA:701,384
+DA:703,384
+DA:704,384
+DA:705,384
+DA:707,384
+DA:708,384
+DA:709,384
+DA:710,384
+DA:724,192
+DA:725,192
+DA:727,192
+DA:729,192
+DA:730,3264
+DA:732,3072
+DA:733,7296
+DA:735,3072
+DA:736,352
+DA:737,352
+DA:740,192
+DA:743,192
+DA:747,192
+DA:750,192
+DA:751,192
+DA:752,192
+DA:772,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:780,0
+DA:782,0
+DA:783,0
+DA:784,0
+DA:785,0
+DA:786,0
+DA:804,4608
+DA:805,4608
+DA:806,4608
+DA:807,4608
+DA:808,4608
+DA:834,768
+DA:837,768
+DA:838,768
+DA:839,768
+DA:840,1152
+DA:841,384
+DA:844,768
+DA:845,0
+DA:848,768
+DA:849,768
+DA:850,768
+DA:876,384
+DA:879,384
+DA:880,384
+DA:881,384
+DA:883,1152
+DA:884,768
+DA:887,384
+DA:888,0
+DA:891,384
+DA:892,384
+DA:893,384
+DA:911,5760
+DA:913,5760
+DA:914,5760
+DA:915,5760
+DA:916,5760
+DA:917,5760
+DA:918,5760
+DA:931,4224
+DA:932,4224
+DA:933,4224
+DA:946,9792
+DA:947,9792
+DA:948,9792
+DA:961,14784
+DA:962,14784
+DA:963,14784
+DA:980,1152
+DA:981,1152
+DA:982,1152
+DA:983,1152
+DA:984,1152
+DA:997,144
+DA:998,144
+DA:999,144
+DA:1000,144
+DA:1001,144
+DA:1023,0
+DA:1024,0
+DA:1025,0
+DA:1036,192
+DA:1037,192
+DA:1038,192
+DA:1039,192
+DA:1052,0
+DA:1053,0
+DA:1054,0
+DA:1067,0
+DA:1068,0
+DA:1069,0
+DA:1082,0
+DA:1083,0
+DA:1087,0
+DA:1089,0
+DA:1090,0
+DA:1091,0
+DA:1094,0
+DA:1095,0
+DA:1097,0
+DA:1098,0
+DA:1099,0
+DA:1102,0
+DA:1103,0
+DA:1104,0
+DA:1106,0
+DA:1107,0
+DA:1120,480
+DA:1121,480
+DA:1122,480
+DA:1134,0
+DA:1150,3072
+DA:1153,3072
+DA:1154,3072
+DA:1155,3072
+DA:1157,3072
+DA:1158,3072
+DA:1159,3072
+DA:1171,10368
+DA:1172,10368
+DA:1173,9984
+DA:1174,9984
+DA:1177,384
+DA:1178,384
+DA:1181,1152
+DA:1182,768
+DA:1183,768
+DA:1185,768
+DA:1186,384
+DA:1187,384
+DA:1189,384
+DA:1190,384
+DA:1193,384
+DA:1195,384
+DA:1196,384
+DA:1197,384
+DA:1198,384
+DA:1199,384
+DA:1200,384
+DA:1201,384
+LF:330
+LH:230
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunctioncontext.c
+FNL:0,111,128
+FNA:0,576,CeedQFunctionContextDestroyData
+FNL:1,140,143
+FNA:1,0,CeedQFunctionContextView_Object
+FNL:2,154,157
+FNA:2,0,CeedQFunctionContextDestroy_Object
+FNL:3,177,180
+FNA:3,288,CeedQFunctionContextGetCeed
+FNL:4,191,191
+FNA:4,0,CeedQFunctionContextReturnCeed
+FNL:5,203,208
+FNA:5,1584,CeedQFunctionContextHasValidData
+FNL:6,221,226
+FNA:6,0,CeedQFunctionContextHasBorrowedDataOfType
+FNL:7,238,241
+FNA:7,0,CeedQFunctionContextGetState
+FNL:8,253,256
+FNA:8,4680,CeedQFunctionContextGetBackendData
+FNL:9,268,271
+FNA:9,288,CeedQFunctionContextSetBackendData
+FNL:10,284,295
+FNA:10,0,CeedQFunctionContextGetFieldLabel
+FNL:11,309,327
+FNA:11,0,CeedQFunctionContextSetGeneric
+FNL:12,342,365
+FNA:12,0,CeedQFunctionContextGetGenericRead
+FNL:13,36,42
+FNA:13,0,CeedQFunctionContextGetFieldIndex
+FNL:14,379,388
+FNA:14,0,CeedQFunctionContextRestoreGenericRead
+FNL:15,401,405
+FNA:15,0,CeedQFunctionContextSetDouble
+FNL:16,419,423
+FNA:16,0,CeedQFunctionContextGetDoubleRead
+FNL:17,436,440
+FNA:17,0,CeedQFunctionContextRestoreDoubleRead
+FNL:18,453,457
+FNA:18,0,CeedQFunctionContextSetInt32
+FNL:19,471,475
+FNA:19,0,CeedQFunctionContextGetInt32Read
+FNL:20,488,492
+FNA:20,0,CeedQFunctionContextRestoreInt32Read
+FNL:21,505,509
+FNA:21,0,CeedQFunctionContextSetBoolean
+FNL:22,523,527
+FNA:22,0,CeedQFunctionContextGetBooleanRead
+FNL:23,540,544
+FNA:23,0,CeedQFunctionContextRestoreBooleanRead
+FNL:24,557,561
+FNA:24,576,CeedQFunctionContextGetDataDestroy
+FNL:25,572,575
+FNA:25,2496,CeedQFunctionContextReference
+FNL:26,58,100
+FNA:26,0,CeedQFunctionContextRegisterGeneric
+FNL:27,595,610
+FNA:27,540,CeedQFunctionContextCreate
+FNL:28,627,632
+FNA:28,2352,CeedQFunctionContextReferenceCopy
+FNL:29,650,660
+FNA:29,288,CeedQFunctionContextSetData
+FNL:30,676,694
+FNA:30,0,CeedQFunctionContextTakeData
+FNL:31,713,728
+FNA:31,1584,CeedQFunctionContextGetData
+FNL:32,747,761
+FNA:32,0,CeedQFunctionContextGetDataRead
+FNL:33,773,780
+FNA:33,1584,CeedQFunctionContextRestoreData
+FNL:34,792,799
+FNA:34,0,CeedQFunctionContextRestoreDataRead
+FNL:35,814,817
+FNA:35,0,CeedQFunctionContextRegisterDouble
+FNL:36,832,835
+FNA:36,0,CeedQFunctionContextRegisterInt32
+FNL:37,850,853
+FNA:37,0,CeedQFunctionContextRegisterBoolean
+FNL:38,866,870
+FNA:38,0,CeedQFunctionContextGetAllFieldLabels
+FNL:39,886,894
+FNA:39,0,CeedContextFieldLabelGetDescription
+FNL:40,906,909
+FNA:40,1476,CeedQFunctionContextGetContextSize
+FNL:41,921,924
+FNA:41,0,CeedQFunctionContextSetNumViewTabs
+FNL:42,936,939
+FNA:42,0,CeedQFunctionContextGetNumViewTabs
+FNL:43,951,969
+FNA:43,0,CeedQFunctionContextView
+FNL:44,982,987
+FNA:44,0,CeedQFunctionContextSetDataDestroy
+FNL:45,998,1016
+FNA:45,9360,CeedQFunctionContextDestroy
+FNF:46
+FNH:14
+DA:36,0
+DA:37,0
+DA:38,0
+DA:39,0
+DA:41,0
+DA:58,0
+DA:60,0
+DA:61,0
+DA:64,0
+DA:65,0
+DA:69,0
+DA:70,0
+DA:71,0
+DA:72,0
+DA:73,0
+DA:74,0
+DA:76,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:82,0
+DA:83,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:92,0
+DA:93,0
+DA:94,0
+DA:95,0
+DA:96,0
+DA:97,0
+DA:98,0
+DA:99,0
+DA:111,576
+DA:112,576
+DA:113,144
+DA:118,432
+DA:119,432
+DA:122,0
+DA:123,0
+DA:124,0
+DA:127,576
+DA:140,0
+DA:141,0
+DA:142,0
+DA:154,0
+DA:155,0
+DA:156,0
+DA:177,288
+DA:178,288
+DA:179,288
+DA:191,0
+DA:203,1584
+DA:204,1584
+DA:206,1584
+DA:207,1584
+DA:221,0
+DA:222,0
+DA:224,0
+DA:225,0
+DA:238,0
+DA:239,0
+DA:240,0
+DA:253,4680
+DA:254,4680
+DA:255,4680
+DA:268,288
+DA:269,288
+DA:270,288
+DA:284,0
+DA:287,0
+DA:289,0
+DA:290,0
+DA:292,0
+DA:294,0
+DA:309,0
+DA:314,0
+DA:318,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:322,0
+DA:323,0
+DA:324,0
+DA:326,0
+DA:342,0
+DA:347,0
+DA:351,0
+DA:352,0
+DA:353,0
+DA:354,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:358,0
+DA:359,0
+DA:360,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:379,0
+DA:382,0
+DA:386,0
+DA:387,0
+DA:401,0
+DA:402,0
+DA:403,0
+DA:404,0
+DA:419,0
+DA:420,0
+DA:421,0
+DA:422,0
+DA:436,0
+DA:437,0
+DA:438,0
+DA:439,0
+DA:453,0
+DA:454,0
+DA:455,0
+DA:456,0
+DA:471,0
+DA:472,0
+DA:473,0
+DA:474,0
+DA:488,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:505,0
+DA:506,0
+DA:507,0
+DA:508,0
+DA:523,0
+DA:524,0
+DA:525,0
+DA:526,0
+DA:540,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:557,576
+DA:558,576
+DA:559,576
+DA:560,576
+DA:572,2496
+DA:573,2496
+DA:574,2496
+DA:595,540
+DA:596,540
+DA:599,252
+DA:600,252
+DA:601,252
+DA:602,252
+DA:603,252
+DA:606,288
+DA:607,288
+DA:608,288
+DA:609,288
+DA:627,2352
+DA:628,2352
+DA:629,2352
+DA:630,2352
+DA:631,2352
+DA:650,288
+DA:651,288
+DA:652,288
+DA:655,288
+DA:656,288
+DA:657,288
+DA:658,288
+DA:659,288
+DA:676,0
+DA:677,0
+DA:678,0
+DA:680,0
+DA:681,0
+DA:683,0
+DA:684,0
+DA:687,0
+DA:688,0
+DA:691,0
+DA:692,0
+DA:693,0
+DA:713,1584
+DA:714,1584
+DA:716,1584
+DA:717,1584
+DA:719,1584
+DA:722,1584
+DA:723,1584
+DA:725,1584
+DA:726,1584
+DA:727,1584
+DA:747,0
+DA:748,0
+DA:750,0
+DA:752,0
+DA:755,0
+DA:756,0
+DA:758,0
+DA:759,0
+DA:760,0
+DA:773,1584
+DA:774,1584
+DA:776,1584
+DA:777,1584
+DA:778,1584
+DA:779,1584
+DA:792,0
+DA:793,0
+DA:795,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:814,0
+DA:816,0
+DA:832,0
+DA:834,0
+DA:850,0
+DA:852,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:869,0
+DA:886,0
+DA:888,0
+DA:889,0
+DA:890,0
+DA:891,0
+DA:892,0
+DA:893,0
+DA:906,1476
+DA:907,1476
+DA:908,1476
+DA:921,0
+DA:922,0
+DA:923,0
+DA:936,0
+DA:937,0
+DA:938,0
+DA:951,0
+DA:952,0
+DA:955,0
+DA:957,0
+DA:958,0
+DA:959,0
+DA:962,0
+DA:963,0
+DA:964,0
+DA:965,0
+DA:967,0
+DA:968,0
+DA:982,0
+DA:983,0
+DA:984,0
+DA:985,0
+DA:986,0
+DA:998,9360
+DA:999,9360
+DA:1000,9072
+DA:1001,9072
+DA:1003,288
+DA:1005,288
+DA:1006,288
+DA:1007,288
+DA:1008,0
+DA:1009,0
+DA:1010,0
+DA:1012,288
+DA:1013,288
+DA:1014,288
+DA:1015,288
+LF:274
+LH:81
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-register.c
+FNL:0,30,44
+FNA:0,432,CeedRegisterAll
+FNF:1
+FNH:1
+DA:30,432
+DA:31,432
+DA:34,432
+DA:35,192
+DA:40,192
+DA:43,432
+LF:6
+LH:6
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-tensor.c
+FNL:0,124,136
+FNA:0,0,CeedTensorContractStridedApply
+FNL:1,148,151
+FNA:1,192,CeedTensorContractGetCeed
+FNL:2,162,162
+FNA:2,192,CeedTensorContractReturnCeed
+FNL:3,174,177
+FNA:3,0,CeedTensorContractGetData
+FNL:4,189,192
+FNA:4,0,CeedTensorContractSetData
+FNL:5,203,206
+FNA:5,384,CeedTensorContractReference
+FNL:6,223,228
+FNA:6,0,CeedTensorContractReferenceCopy
+FNL:7,239,250
+FNA:7,768,CeedTensorContractDestroy
+FNL:8,31,34
+FNA:8,0,CeedTensorContractDestroy_Object
+FNL:9,54,69
+FNA:9,576,CeedTensorContractCreate
+FNL:10,94,98
+FNA:10,8064,CeedTensorContractApply
+FNF:11
+FNH:6
+DA:31,0
+DA:32,0
+DA:33,0
+DA:54,576
+DA:55,576
+DA:58,192
+DA:59,192
+DA:60,192
+DA:61,192
+DA:62,192
+DA:65,384
+DA:66,384
+DA:67,384
+DA:68,384
+DA:94,8064
+DA:96,8064
+DA:97,8064
+DA:124,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:131,0
+DA:132,0
+DA:135,0
+DA:148,192
+DA:149,192
+DA:150,192
+DA:162,192
+DA:174,0
+DA:175,0
+DA:176,0
+DA:189,0
+DA:190,0
+DA:191,0
+DA:203,384
+DA:204,384
+DA:205,384
+DA:223,0
+DA:224,0
+DA:225,0
+DA:226,0
+DA:227,0
+DA:239,768
+DA:240,768
+DA:241,384
+DA:242,384
+DA:244,384
+DA:245,192
+DA:247,384
+DA:248,384
+DA:249,384
+LF:51
+LH:30
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-vector.c
+FNL:0,1002,1031
+FNA:0,0,CeedVectorReciprocal
+FNL:1,1043,1046
+FNA:1,0,CeedVectorSetNumViewTabs
+FNL:2,1058,1061
+FNA:2,0,CeedVectorGetNumViewTabs
+FNL:3,1080,1111
+FNA:3,0,CeedVectorViewRange
+FNL:4,1124,1130
+FNA:4,0,CeedVectorView
+FNL:5,113,118
+FNA:5,0,CeedVectorHasBorrowedArrayOfType
+FNL:6,1142,1145
+FNA:6,3648,CeedVectorGetCeed
+FNL:7,1156,1156
+FNA:7,0,CeedVectorReturnCeed
+FNL:8,1168,1171
+FNA:8,123936,CeedVectorGetLength
+FNL:9,1182,1194
+FNA:9,20352,CeedVectorDestroy
+FNL:10,130,133
+FNA:10,384,CeedVectorGetState
+FNL:11,145,148
+FNA:11,74688,CeedVectorGetData
+FNL:12,160,163
+FNA:12,3648,CeedVectorSetData
+FNL:13,174,177
+FNA:13,1632,CeedVectorReference
+FNL:14,198,216
+FNA:14,6120,CeedVectorCreate
+FNL:15,233,238
+FNA:15,8928,CeedVectorReferenceCopy
+FNL:16,250,285
+FNA:16,0,CeedVectorCopy
+FNL:17,300,336
+FNA:17,0,CeedVectorCopyStrided
+FNL:18,353,365
+FNA:18,6744,CeedVectorSetArray
+FNL:19,377,395
+FNA:19,1056,CeedVectorSetValue
+FNL:20,412,435
+FNA:20,0,CeedVectorSetValueStrided
+FNL:21,450,468
+FNA:21,0,CeedVectorSyncArray
+FNL:22,485,508
+FNA:22,0,CeedVectorTakeArray
+FNL:23,52,55
+FNA:23,0,CeedVectorView_Object
+FNL:24,527,549
+FNA:24,1824,CeedVectorGetArray
+FNL:25,565,586
+FNA:25,14976,CeedVectorGetArrayRead
+FNL:26,602,618
+FNA:26,11088,CeedVectorGetArrayWrite
+FNL:27,630,639
+FNA:27,12912,CeedVectorRestoreArray
+FNL:28,651,661
+FNA:28,14976,CeedVectorRestoreArrayRead
+FNL:29,66,69
+FNA:29,0,CeedVectorDestroy_Object
+FNL:30,677,723
+FNA:30,0,CeedVectorNorm
+FNL:31,735,757
+FNA:31,0,CeedVectorScale
+FNL:32,770,827
+FNA:32,0,CeedVectorAXPY
+FNL:33,841,898
+FNA:33,0,CeedVectorAXPBY
+FNL:34,89,100
+FNA:34,16800,CeedVectorHasValidArray
+FNL:35,913,991
+FNA:35,0,CeedVectorPointwiseMult
+FNF:36
+FNH:17
+DA:52,0
+DA:53,0
+DA:54,0
+DA:66,0
+DA:67,0
+DA:68,0
+DA:89,16800
+DA:92,16800
+DA:93,16800
+DA:94,16800
+DA:95,0
+DA:96,0
+DA:98,16800
+DA:99,16800
+DA:113,0
+DA:114,0
+DA:116,0
+DA:117,0
+DA:130,384
+DA:131,384
+DA:132,384
+DA:145,74688
+DA:146,74688
+DA:147,74688
+DA:160,3648
+DA:161,3648
+DA:162,3648
+DA:174,1632
+DA:175,1632
+DA:176,1632
+DA:198,6120
+DA:199,6120
+DA:200,6120
+DA:203,2472
+DA:204,2472
+DA:205,2472
+DA:206,2472
+DA:207,2472
+DA:210,3648
+DA:211,3648
+DA:212,3648
+DA:213,3648
+DA:214,3648
+DA:215,3648
+DA:233,8928
+DA:234,8928
+DA:235,8928
+DA:236,8928
+DA:237,8928
+DA:250,0
+DA:258,0
+DA:259,0
+DA:260,0
+DA:262,0
+DA:263,0
+DA:264,0
+DA:268,0
+DA:274,0
+DA:275,0
+DA:276,0
+DA:280,0
+DA:281,0
+DA:283,0
+DA:284,0
+DA:300,0
+DA:302,0
+DA:303,0
+DA:309,0
+DA:310,0
+DA:311,0
+DA:312,0
+DA:314,0
+DA:316,0
+DA:320,0
+DA:321,0
+DA:322,0
+DA:323,0
+DA:327,0
+DA:328,0
+DA:329,0
+DA:330,0
+DA:333,0
+DA:334,0
+DA:335,0
+DA:353,6744
+DA:356,6744
+DA:357,6744
+DA:359,6744
+DA:361,6744
+DA:362,6744
+DA:363,6744
+DA:364,6744
+DA:377,1056
+DA:378,1056
+DA:380,1056
+DA:382,1056
+DA:383,144
+DA:384,144
+DA:389,912
+DA:390,912
+DA:391,274224
+DA:392,912
+DA:394,1056
+DA:412,0
+DA:415,0
+DA:417,0
+DA:418,0
+DA:419,0
+DA:422,0
+DA:423,0
+DA:424,0
+DA:428,0
+DA:429,0
+DA:430,0
+DA:431,0
+DA:432,0
+DA:434,0
+DA:450,0
+DA:453,0
+DA:456,0
+DA:457,0
+DA:459,0
+DA:460,0
+DA:464,0
+DA:465,0
+DA:467,0
+DA:485,0
+DA:487,0
+DA:489,0
+DA:490,0
+DA:492,0
+DA:493,0
+DA:494,0
+DA:496,0
+DA:497,0
+DA:500,0
+DA:501,0
+DA:504,0
+DA:506,0
+DA:507,0
+DA:527,1824
+DA:530,1824
+DA:531,1824
+DA:533,1824
+DA:535,1824
+DA:536,1824
+DA:537,1824
+DA:539,1824
+DA:540,1824
+DA:543,1824
+DA:545,0
+DA:547,1824
+DA:548,1824
+DA:565,14976
+DA:568,14976
+DA:569,14976
+DA:572,14976
+DA:573,14976
+DA:574,14976
+DA:576,14976
+DA:577,14976
+DA:580,14976
+DA:582,0
+DA:584,14976
+DA:585,14976
+DA:602,11088
+DA:605,11088
+DA:606,11088
+DA:608,11088
+DA:610,11088
+DA:611,11088
+DA:612,11088
+DA:614,0
+DA:616,11088
+DA:617,11088
+DA:630,12912
+DA:633,12912
+DA:634,12912
+DA:635,12912
+DA:636,12912
+DA:637,12912
+DA:638,12912
+DA:651,14976
+DA:654,14976
+DA:656,14976
+DA:657,14976
+DA:658,14976
+DA:659,14976
+DA:660,14976
+DA:677,0
+DA:678,0
+DA:681,0
+DA:682,0
+DA:685,0
+DA:686,0
+DA:687,0
+DA:688,0
+DA:692,0
+DA:693,0
+DA:694,0
+DA:698,0
+DA:699,0
+DA:701,0
+DA:702,0
+DA:703,0
+DA:704,0
+DA:705,0
+DA:707,0
+DA:708,0
+DA:709,0
+DA:710,0
+DA:712,0
+DA:713,0
+DA:714,0
+DA:715,0
+DA:716,0
+DA:719,0
+DA:721,0
+DA:722,0
+DA:735,0
+DA:736,0
+DA:738,0
+DA:740,0
+DA:741,0
+DA:745,0
+DA:746,0
+DA:749,0
+DA:752,0
+DA:753,0
+DA:754,0
+DA:755,0
+DA:756,0
+DA:770,0
+DA:771,0
+DA:773,0
+DA:774,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:782,0
+DA:784,0
+DA:785,0
+DA:787,0
+DA:788,0
+DA:794,0
+DA:795,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:799,0
+DA:800,0
+DA:802,0
+DA:803,0
+DA:807,0
+DA:810,0
+DA:811,0
+DA:812,0
+DA:816,0
+DA:817,0
+DA:819,0
+DA:820,0
+DA:822,0
+DA:824,0
+DA:825,0
+DA:826,0
+DA:841,0
+DA:842,0
+DA:844,0
+DA:845,0
+DA:847,0
+DA:848,0
+DA:849,0
+DA:853,0
+DA:855,0
+DA:856,0
+DA:858,0
+DA:859,0
+DA:865,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:869,0
+DA:870,0
+DA:871,0
+DA:873,0
+DA:874,0
+DA:878,0
+DA:881,0
+DA:882,0
+DA:883,0
+DA:887,0
+DA:888,0
+DA:890,0
+DA:891,0
+DA:893,0
+DA:895,0
+DA:896,0
+DA:897,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:930,0
+DA:931,0
+DA:932,0
+DA:933,0
+DA:934,0
+DA:935,0
+DA:936,0
+DA:937,0
+DA:938,0
+DA:939,0
+DA:941,0
+DA:942,0
+DA:943,0
+DA:946,0
+DA:947,0
+DA:949,0
+DA:950,0
+DA:954,0
+DA:957,0
+DA:958,0
+DA:959,0
+DA:963,0
+DA:964,0
+DA:966,0
+DA:968,0
+DA:969,0
+DA:971,0
+DA:973,0
+DA:974,0
+DA:975,0
+DA:976,0
+DA:977,0
+DA:978,0
+DA:981,0
+DA:982,0
+DA:983,0
+DA:985,0
+DA:987,0
+DA:988,0
+DA:989,0
+DA:990,0
+DA:1002,0
+DA:1003,0
+DA:1007,0
+DA:1008,0
+DA:1012,0
+DA:1015,0
+DA:1016,0
+DA:1019,0
+DA:1020,0
+DA:1021,0
+DA:1024,0
+DA:1025,0
+DA:1026,0
+DA:1029,0
+DA:1030,0
+DA:1043,0
+DA:1044,0
+DA:1045,0
+DA:1058,0
+DA:1059,0
+DA:1060,0
+DA:1080,0
+DA:1082,0
+DA:1086,0
+DA:1089,0
+DA:1091,0
+DA:1092,0
+DA:1093,0
+DA:1096,0
+DA:1097,0
+DA:1098,0
+DA:1099,0
+DA:1101,0
+DA:1102,0
+DA:1104,0
+DA:1105,0
+DA:1106,0
+DA:1107,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1124,0
+DA:1127,0
+DA:1128,0
+DA:1129,0
+DA:1142,3648
+DA:1143,3648
+DA:1144,3648
+DA:1156,0
+DA:1168,123936
+DA:1169,123936
+DA:1170,123936
+DA:1182,20352
+DA:1183,20352
+DA:1184,16704
+DA:1185,16704
+DA:1187,3648
+DA:1188,3648
+DA:1190,3648
+DA:1191,3648
+DA:1192,3648
+DA:1193,3648
+LF:409
+LH:118
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed.c
+FNL:0,1013,1022
+FNA:0,0,CeedGetRustSourceRoots
+FNL:1,1034,1042
+FNA:1,48,CeedRestoreJitSourceRoots
+FNL:2,1054,1062
+FNA:2,0,CeedRestoreRustSourceRoots
+FNL:3,1077,1086
+FNA:3,0,CeedGetJitDefines
+FNL:4,1098,1106
+FNA:4,0,CeedRestoreJitDefines
+FNL:5,1171,1377
+FNA:5,432,CeedInit
+FNL:6,123,139
+FNA:6,4416,CeedRegisterImpl
+FNL:7,1389,1402
+FNA:7,0,CeedSetStream
+FNL:8,1419,1424
+FNA:8,24420,CeedReferenceCopy
+FNL:9,1436,1439
+FNA:9,648,CeedGetResource
+FNL:10,1451,1466
+FNA:10,0,CeedGetPreferredMemType
+FNL:11,1478,1481
+FNA:11,0,CeedIsDeterministic
+FNL:12,1493,1512
+FNA:12,432,CeedAddJitSourceRoot
+FNL:13,150,153
+FNA:13,0,CeedWorkVectorsCreate
+FNL:14,1524,1545
+FNA:14,0,CeedAddRustSourceRoot
+FNL:15,1557,1576
+FNA:15,0,CeedAddJitDefine
+FNL:16,1588,1591
+FNA:16,0,CeedSetNumViewTabs
+FNL:17,1603,1606
+FNA:17,0,CeedGetNumViewTabs
+FNL:18,1618,1638
+FNA:18,0,CeedView
+FNL:19,164,179
+FNA:19,300,CeedWorkVectorsDestroy
+FNL:20,1649,1693
+FNA:20,48480,CeedDestroy
+FNL:21,1711,1732
+FNA:21,0,CeedErrorImpl
+FNL:22,1800,1807
+FNA:22,0,CeedErrorExit
+FNL:23,1819,1824
+FNA:23,0,CeedSetErrorHandler
+FNL:24,1838,1842
+FNA:24,0,CeedGetErrorMessage
+FNL:25,1856,1861
+FNA:25,0,CeedResetErrorMessage
+FNL:26,1882,1888
+FNA:26,0,CeedGetVersion
+FNL:27,1899,1902
+FNA:27,0,CeedGetScalarType
+FNL:28,191,194
+FNA:28,0,CeedView_Object
+FNL:29,205,208
+FNA:29,0,CeedDestroy_Object
+FNL:30,282,286
+FNA:30,1812,CeedMallocArray
+FNL:31,303,307
+FNA:31,43296,CeedCallocArray
+FNL:32,324,328
+FNA:32,1776,CeedReallocArray
+FNL:33,344,349
+FNA:33,2928,CeedStringAllocCopy
+FNL:34,360,364
+FNA:34,58680,CeedFree
+FNL:35,380,407
+FNA:35,5400,CeedSetHostGenericArray
+FNL:36,422,426
+FNA:36,0,CeedSetHostBoolArray
+FNL:37,441,445
+FNA:37,0,CeedSetHostCeedInt8Array
+FNL:38,460,464
+FNA:38,648,CeedSetHostCeedIntArray
+FNL:39,479,483
+FNA:39,4752,CeedSetHostCeedScalarArray
+FNL:40,498,502
+FNA:40,1536,CeedRegister
+FNL:41,514,517
+FNA:41,0,CeedIsDebug
+FNL:42,533,540
+FNA:42,0,CeedGetResourceRoot
+FNL:43,552,560
+FNA:43,4032,CeedGetParent
+FNL:44,572,576
+FNA:44,4428,CeedGetDelegate
+FNL:45,591,595
+FNA:45,240,CeedSetDelegate
+FNL:46,608,621
+FNA:46,4428,CeedGetObjectDelegate
+FNL:47,638,656
+FNA:47,0,CeedSetObjectDelegate
+FNL:48,668,678
+FNA:48,0,CeedGetOperatorFallbackCeed
+FNL:49,692,696
+FNA:49,0,CeedSetOperatorFallbackCeed
+FNL:50,708,711
+FNA:50,384,CeedSetDeterministic
+FNL:51,730,752
+FNA:51,63624,CeedSetBackendFunctionImpl
+FNL:52,764,767
+FNA:52,672,CeedGetData
+FNL:53,779,782
+FNA:53,96,CeedSetData
+FNL:54,793,796
+FNA:54,24420,CeedReference
+FNL:55,808,830
+FNA:55,0,CeedGetWorkVectorMemoryUsage
+FNL:56,842,874
+FNA:56,0,CeedClearWorkVectors
+FNL:57,889,938
+FNA:57,0,CeedGetWorkVector
+FNL:58,95,98
+FNA:58,0,CeedRequestWait
+FNL:59,950,974
+FNA:59,0,CeedRestoreWorkVector
+FNL:60,989,998
+FNA:60,48,CeedGetJitSourceRoots
+FNF:61
+FNH:27
+DA:95,0
+DA:96,0
+DA:97,0
+DA:123,4416
+DA:124,4416
+DA:127,4416
+DA:128,4416
+DA:129,4416
+DA:130,4416
+DA:131,4416
+DA:132,4416
+DA:134,0
+DA:137,4416
+DA:138,4416
+DA:150,0
+DA:151,0
+DA:152,0
+DA:164,300
+DA:165,300
+DA:166,0
+DA:167,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:173,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:178,0
+DA:191,0
+DA:192,0
+DA:193,0
+DA:205,0
+DA:206,0
+DA:207,0
+DA:282,1812
+DA:283,1812
+DA:284,1812
+DA:285,1812
+DA:303,43296
+DA:304,43296
+DA:305,43296
+DA:306,43296
+DA:324,1776
+DA:325,1776
+DA:326,1776
+DA:327,1776
+DA:344,2928
+DA:345,2928
+DA:346,2928
+DA:347,2928
+DA:348,2928
+DA:360,58680
+DA:361,58680
+DA:362,58680
+DA:363,58680
+DA:380,5400
+DA:382,5400
+DA:383,2112
+DA:384,2112
+DA:385,2112
+DA:386,0
+DA:388,2112
+DA:389,2112
+DA:392,2112
+DA:393,2112
+DA:394,360
+DA:395,360
+DA:396,360
+DA:397,360
+DA:398,360
+DA:399,360
+DA:400,2928
+DA:401,2928
+DA:402,2928
+DA:403,2928
+DA:404,2928
+DA:406,5400
+DA:422,0
+DA:424,0
+DA:425,0
+DA:441,0
+DA:443,0
+DA:444,0
+DA:460,648
+DA:462,648
+DA:463,648
+DA:479,4752
+DA:481,4752
+DA:482,4752
+DA:498,1536
+DA:499,1536
+DA:500,1536
+DA:501,1536
+DA:514,0
+DA:515,0
+DA:516,0
+DA:533,0
+DA:534,0
+DA:535,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:552,4032
+DA:553,4032
+DA:554,1704
+DA:555,1704
+DA:557,2328
+DA:558,2328
+DA:559,2328
+DA:572,4428
+DA:573,4428
+DA:574,4428
+DA:575,4428
+DA:591,240
+DA:592,240
+DA:593,240
+DA:594,240
+DA:608,4428
+DA:610,4428
+DA:611,0
+DA:612,0
+DA:613,0
+DA:614,0
+DA:619,4428
+DA:620,4428
+DA:638,0
+DA:639,0
+DA:642,0
+DA:643,0
+DA:645,0
+DA:647,0
+DA:650,0
+DA:651,0
+DA:654,0
+DA:655,0
+DA:668,0
+DA:669,0
+DA:670,0
+DA:671,0
+DA:675,0
+DA:676,0
+DA:677,0
+DA:692,0
+DA:693,0
+DA:694,0
+DA:695,0
+DA:708,384
+DA:709,384
+DA:710,384
+DA:730,63624
+DA:731,63624
+DA:734,63624
+DA:735,63624
+DA:736,63624
+DA:739,2096256
+DA:740,2096256
+DA:741,63624
+DA:742,63624
+DA:744,63624
+DA:745,63624
+DA:764,672
+DA:765,672
+DA:766,672
+DA:779,96
+DA:780,96
+DA:781,96
+DA:793,24420
+DA:794,24420
+DA:795,24420
+DA:808,0
+DA:809,0
+DA:812,0
+DA:813,0
+DA:814,0
+DA:815,0
+DA:816,0
+DA:818,0
+DA:819,0
+DA:820,0
+DA:822,0
+DA:823,0
+DA:825,0
+DA:826,0
+DA:829,0
+DA:842,0
+DA:843,0
+DA:846,0
+DA:847,0
+DA:848,0
+DA:849,0
+DA:850,0
+DA:852,0
+DA:853,0
+DA:854,0
+DA:856,0
+DA:857,0
+DA:859,0
+DA:860,0
+DA:861,0
+DA:863,0
+DA:864,0
+DA:865,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:869,0
+DA:873,0
+DA:889,0
+DA:890,0
+DA:893,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:903,0
+DA:906,0
+DA:907,0
+DA:910,0
+DA:911,0
+DA:915,0
+DA:916,0
+DA:917,0
+DA:918,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:923,0
+DA:925,0
+DA:926,0
+DA:928,0
+DA:929,0
+DA:932,0
+DA:933,0
+DA:934,0
+DA:936,0
+DA:937,0
+DA:950,0
+DA:951,0
+DA:954,0
+DA:955,0
+DA:956,0
+DA:957,0
+DA:958,0
+DA:961,0
+DA:962,0
+DA:963,0
+DA:964,0
+DA:965,0
+DA:967,0
+DA:968,0
+DA:989,48
+DA:992,48
+DA:993,48
+DA:994,48
+DA:995,48
+DA:996,48
+DA:997,48
+DA:1013,0
+DA:1016,0
+DA:1017,0
+DA:1018,0
+DA:1019,0
+DA:1020,0
+DA:1021,0
+DA:1034,48
+DA:1037,48
+DA:1038,48
+DA:1039,48
+DA:1040,48
+DA:1041,48
+DA:1054,0
+DA:1057,0
+DA:1058,0
+DA:1059,0
+DA:1060,0
+DA:1061,0
+DA:1077,0
+DA:1080,0
+DA:1081,0
+DA:1082,0
+DA:1083,0
+DA:1084,0
+DA:1085,0
+DA:1098,0
+DA:1101,0
+DA:1102,0
+DA:1103,0
+DA:1104,0
+DA:1105,0
+DA:1171,432
+DA:1172,432
+DA:1175,432
+DA:1176,432
+DA:1179,432
+DA:1180,432
+DA:1181,432
+DA:1182,432
+DA:1183,0
+DA:1185,0
+DA:1186,0
+DA:1188,0
+DA:1190,0
+DA:1191,0
+DA:1193,432
+DA:1197,432
+DA:1198,9504
+DA:1199,10368
+DA:1200,9936
+DA:1201,9936
+DA:1202,65520
+DA:1203,9936
+DA:1204,9936
+DA:1205,864
+DA:1206,864
+DA:1207,864
+DA:1211,432
+DA:1249,432
+DA:1250,432
+DA:1251,432
+DA:1252,432
+DA:1253,432
+DA:1254,432
+DA:1255,432
+DA:1256,0
+DA:1257,0
+DA:1258,432
+DA:1259,432
+DA:1262,432
+DA:1351,432
+DA:1352,432
+DA:1355,432
+DA:1358,432
+DA:1362,432
+DA:1366,432
+DA:1368,432
+DA:1369,0
+DA:1371,432
+DA:1375,432
+DA:1376,432
+DA:1389,0
+DA:1390,0
+DA:1391,0
+DA:1392,0
+DA:1395,0
+DA:1397,0
+DA:1398,0
+DA:1399,0
+DA:1401,0
+DA:1419,24420
+DA:1420,24420
+DA:1421,24420
+DA:1422,24420
+DA:1423,24420
+DA:1436,648
+DA:1437,648
+DA:1438,648
+DA:1451,0
+DA:1452,0
+DA:1453,0
+DA:1456,0
+DA:1458,0
+DA:1459,0
+DA:1461,0
+DA:1463,0
+DA:1465,0
+DA:1478,0
+DA:1479,0
+DA:1480,0
+DA:1493,432
+DA:1496,432
+DA:1497,432
+DA:1499,432
+DA:1500,432
+DA:1502,432
+DA:1503,432
+DA:1504,432
+DA:1505,432
+DA:1507,432
+DA:1508,432
+DA:1509,432
+DA:1510,432
+DA:1511,432
+DA:1524,0
+DA:1527,0
+DA:1528,0
+DA:1530,0
+DA:1531,0
+DA:1533,0
+DA:1534,0
+DA:1535,0
+DA:1536,0
+DA:1538,0
+DA:1539,0
+DA:1540,0
+DA:1541,0
+DA:1542,0
+DA:1543,0
+DA:1544,0
+DA:1557,0
+DA:1560,0
+DA:1561,0
+DA:1563,0
+DA:1564,0
+DA:1566,0
+DA:1567,0
+DA:1568,0
+DA:1569,0
+DA:1571,0
+DA:1572,0
+DA:1573,0
+DA:1574,0
+DA:1575,0
+DA:1588,0
+DA:1589,0
+DA:1590,0
+DA:1603,0
+DA:1604,0
+DA:1605,0
+DA:1618,0
+DA:1619,0
+DA:1622,0
+DA:1625,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1631,0
+DA:1635,0
+DA:1636,0
+DA:1637,0
+DA:1649,48480
+DA:1650,48480
+DA:1651,48180
+DA:1652,48180
+DA:1655,300
+DA:1657,300
+DA:1659,300
+DA:1661,300
+DA:1662,0
+DA:1663,0
+DA:1664,0
+DA:1666,0
+DA:1669,300
+DA:1671,600
+DA:1672,300
+DA:1674,300
+DA:1676,300
+DA:1677,0
+DA:1679,300
+DA:1681,300
+DA:1682,0
+DA:1684,300
+DA:1686,300
+DA:1687,300
+DA:1688,300
+DA:1689,300
+DA:1690,300
+DA:1691,300
+DA:1692,300
+DA:1711,0
+DA:1715,0
+DA:1716,0
+DA:1717,0
+DA:1800,0
+DA:1801,0
+DA:1803,0
+DA:1804,0
+DA:1805,0
+DA:1819,0
+DA:1820,0
+DA:1821,0
+DA:1822,0
+DA:1823,0
+DA:1838,0
+DA:1839,0
+DA:1840,0
+DA:1841,0
+DA:1856,0
+DA:1857,0
+DA:1858,0
+DA:1859,0
+DA:1860,0
+DA:1882,0
+DA:1883,0
+DA:1884,0
+DA:1885,0
+DA:1886,0
+DA:1887,0
+DA:1899,0
+DA:1900,0
+DA:1901,0
+LF:493
+LH:205
+end_of_record
+TN:
+SF:/usr/include/valgrind/valgrind.h
+FNL:0,7293,7322
+FNA:0,0,VALGRIND_PRINTF
+FNL:1,7332,7361
+FNA:1,0,VALGRIND_PRINTF_BACKTRACE
+FNF:2
+FNH:0
+DA:7293,0
+DA:7305,0
+DA:7313,0
+DA:7319,0
+DA:7320,0
+DA:7332,0
+DA:7344,0
+DA:7352,0
+DA:7358,0
+DA:7359,0
+LF:10
+LH:0
+end_of_record
diff --git a/examples/ceed/ex1-volume-f-c.h b/examples/ceed/ex1-volume-f-c.h
new file mode 100644
index 0000000000..a3316192ff
--- /dev/null
+++ b/examples/ceed/ex1-volume-f-c.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+/// libCEED Q-function for building quadrature data for a mass operator
+CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  long long int *build_data = (long long int *)ctx;
+
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights with shape [1, Q]
+  const CeedScalar *w      = in[1];
+  CeedScalar       *q_data = out[0];
+
+  switch (build_data[0] + 10 * build_data[1]) {
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] =
+            (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) +
+             J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
+            w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass operator
+CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0], out[0] are solution variables with shape [1, Q]
+  // in[1] is quadrature data with shape [1, Q]
+  const CeedScalar *u = in[0], *q_data = in[1];
+  CeedScalar       *v = out[0];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; }  // End of Quadrature Point Loop
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/ceed/ex1-volume-f.f90 b/examples/ceed/ex1-volume-f.f90
new file mode 100644
index 0000000000..580874efc2
--- /dev/null
+++ b/examples/ceed/ex1-volume-f.f90
@@ -0,0 +1,557 @@
+! Copyright (c) 2017-2026,  Lawrence Livermore National Security,  LLC and other CEED contributors.
+! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+
+! SPDX-License-Identifier: BSD-2-Clause
+
+! This file is part of CEED:  http:Cgithub.com/ceed
+
+! libCEED Example 1
+
+! This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator.
+! Arbitrary mesh and solution degrees in 1D,  2D and 3D are supported from the same code.
+
+! The example has no dependencies,  and is designed to be self-contained.
+! For additional examples that use external discretization libraries (MFEM,  PETSc,  etc.) see the subdirectories in libceed/examples.
+
+! All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed).
+
+! Build with:
+
+!     make ex1-volume [CEED_DIR = </path/to/libceed>]
+
+! Sample runs:
+
+!     ./ex1-volume-f
+!     ./ex1-volume-f -ceed /cpu/self
+!     ./ex1-volume-f -ceed /gpu/cuda
+
+! Test in 1D-3D
+! TESTARGS(name = "1D User QFunction") -ceed {ceed_resource} -d 1 -t
+! TESTARGS(name = "2D User QFunction") -ceed {ceed_resource} -d 2 -t
+! TESTARGS(name = "3D User QFunction") -ceed {ceed_resource} -d 3 -t
+! TESTARGS(name = "1D Gallery QFunction") -ceed {ceed_resource} -d 1 -t -g
+! TESTARGS(name = "2D Gallery QFunction") -ceed {ceed_resource} -d 2 -t -g
+! TESTARGS(name = "3D Gallery QFunction") -ceed {ceed_resource} -d 3 -t -g
+
+!> @file
+!> libCEED example using mass operator to compute volume
+
+    include 'ex1-volume-f.h'
+
+!-----------------------------------------------------------------------
+subroutine getcartesianmeshsize(fe_dim, degree, prob_size, num_xyz)
+    implicit none
+    integer fe_dim
+    integer degree
+    integer prob_size
+    integer num_xyz(3)
+
+    integer num_elem
+    integer s, r, d, sd
+    num_elem = prob_size/(degree**fe_dim)
+    s = 0
+
+! Use the approximate formula:
+!    prob_size ~ num_elem * degree^dim
+! find s: num_elem/2 < 2^s <= num_elem
+
+  do while (num_elem > 1)
+    num_elem = num_elem/2
+    s = s + 1
+  end do
+  r = mod(s, fe_dim)
+
+  do d = 1, fe_dim
+    sd = s/fe_dim
+    if (r > 0) then
+      sd = sd + 1
+      r = r - 1
+    end if
+    num_xyz(d) = ISHFT(1, sd)
+  end do
+end
+
+!-----------------------------------------------------------------------
+subroutine buildcartesianrestriction(ceed, fe_dim, num_xyz, degree, num_comp, mesh_size, num_qpts, restriction,&
+&     q_data_restriction, err)
+    implicit none
+    include 'ceed/fortran.h'
+
+    integer ceed
+    integer fe_dim
+    integer num_xyz(3)
+    integer degree
+    integer num_comp
+    integer mesh_size
+    integer num_qpts
+    integer restriction
+    integer q_data_restriction
+    integer err
+
+    integer p
+    integer num_nodes
+    integer elem_qpts
+    integer num_elem
+    integer scalar_size
+    integer nd(3)
+    integer elem_nodes_size
+    integer e_xyz(3),  re
+    integer g_nodes, g_nodes_stride, r_nodes
+    integer, dimension (:), allocatable :: elem_nodes
+
+    integer i, j, k
+
+    p = degree + 1
+    num_nodes = p**fe_dim
+    elem_qpts = num_qpts**fe_dim
+    num_elem  = 1
+    scalar_size = 1
+
+    do i = 1, fe_dim
+      num_elem = num_elem * num_xyz(i)
+      nd(i) = num_xyz(i) * (p - 1) + 1
+      scalar_size = scalar_size*nd(i)
+    end do
+    mesh_size = scalar_size*num_comp
+! elem:       0         1             n-1
+!         |---*-...-*---|---*-...-*---|- ... -|--...--|
+! num_nodes:   0   1    p-1  p  p+1     2*p         n*p
+    elem_nodes_size = num_elem*num_nodes
+    allocate (elem_nodes(elem_nodes_size))
+
+    do i = 1, num_elem
+      e_xyz(1) = 1
+      e_xyz(2) = 1
+      e_xyz(3) = 1
+      re = i - 1
+
+      do j = 1, fe_dim
+        e_xyz(j) = mod(re, num_xyz(j))
+        re = re/num_xyz(j)
+      end do
+
+      do j = 1, num_nodes
+        g_nodes = 0
+        g_nodes_stride = 1
+        r_nodes = j - 1
+
+        do k = 1, fe_dim
+          g_nodes = g_nodes + (e_xyz(k) * (p - 1) + mod(r_nodes, p)) * g_nodes_stride
+          g_nodes_stride = g_nodes_stride * nd(k)
+          r_nodes = r_nodes/p
+        end do
+        elem_nodes((i - 1) * num_nodes + j) = g_nodes
+      end do
+    end do
+
+    call ceedelemrestrictioncreate(ceed, num_elem, num_nodes, num_comp, scalar_size, mesh_size, ceed_mem_host,&
+             &ceed_copy_values, elem_nodes, restriction, err)
+    if (q_data_restriction /=  ceed_qfunction_none) then
+      call ceedelemrestrictioncreatestrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem,&
+               &ceed_strides_backend, q_data_restriction, err)
+    end if
+    deallocate (elem_nodes)
+end
+
+!-----------------------------------------------------------------------
+subroutine transformmeshcoords(fe_dim, mesh_size, coords, exact_volume, err)
+    implicit none
+
+    integer fe_dim
+    integer mesh_size, scalar_size
+    real*8 coords(mesh_size)
+    real*8 exact_volume
+    real*8 m_pi, m_pi_2
+    parameter(m_pi = 3.14159265358979323846d0)
+    parameter(m_pi_2 = 1.57079632679489661923d0)
+    integer err
+
+    integer i
+    real*8 u, v
+
+    scalar_size = mesh_size/fe_dim
+    select case (fe_dim)
+    case (1)
+      do i = 1, scalar_size
+        coords(i) = 0.5d0 + (1.d0/sqrt(3.d0)) * sin((2.d0/3.d0) * m_pi * (coords(i) - 0.5d0))
+      end do
+      exact_volume = 1.d0
+
+    case (2,  3)
+      do i = 1, scalar_size
+        u = 1.d0 + coords(i)
+        v = m_pi_2 * coords(i + scalar_size)
+
+        coords(i)               = u * cos(v)
+        coords(i + scalar_size) = u * sin(v)
+      end do
+      exact_volume = 3.d0/4.d0 * m_pi
+    end select
+end
+
+!-----------------------------------------------------------------------
+subroutine setcartesianmeshcoords(fe_dim, num_xyz, mesh_degree, mesh_coords, exact_volume, err)
+    implicit none
+    include 'ceed/fortran.h'
+
+    integer fe_dim
+    integer num_xyz(3)
+    integer mesh_degree
+    integer mesh_coords
+    real*8 exact_volume
+    integer err
+
+    integer p
+    integer scalar_size
+    integer coords_size
+    integer r_nodes
+    integer d_1d
+    integer nd(3)
+    real*8, dimension (:), allocatable :: nodes,  qpts
+    real*8, dimension (:), allocatable :: coords
+    integer*8 offset
+    integer i, j
+    p = mesh_degree + 1
+    scalar_size = 1
+
+    do i = 1, fe_dim
+      nd(i) = num_xyz(i) * (p - 1) + 1
+      scalar_size = scalar_size * nd(i)
+    end do
+
+    coords_size = scalar_size * fe_dim
+    allocate (coords(coords_size))
+
+! The H1 basis uses Lobatto quadrature points as nodes
+    allocate (nodes(p))
+    allocate (qpts(p))
+    call ceedlobattoquadrature(p, nodes, qpts, err)
+    deallocate(qpts)
+    do i = 1, p
+      nodes(i) = 0.5 + 0.5 * nodes(i)
+    end do
+
+    do i = 1, scalar_size
+      r_nodes = i - 1
+
+      do j = 1, fe_dim
+        d_1d  =  mod(r_nodes, nd(j))
+        coords(scalar_size * (j - 1) + i) = ((d_1d/(p - 1)) + nodes(mod(d_1d, p - 1) + 1))/num_xyz(j)
+        r_nodes = r_nodes/nd(j)
+      end do
+    end do
+    deallocate(nodes)
+
+    call transformmeshcoords(fe_dim, coords_size, coords, exact_volume, err)
+
+    offset = 0
+    call ceedvectorsetarray(mesh_coords, ceed_mem_host, ceed_copy_values, coords, offset, err)
+    deallocate(coords)
+end
+
+!-----------------------------------------------------------------------
+program main
+    implicit none
+    include 'ceed/fortran.h'
+
+    character ceed_spec*32
+    integer fe_dim, num_comp_x, mesh_degree, sol_degree, num_qpts
+    integer num_elem, num_xyz(3), elem_qpts
+    integer prob_size, mesh_size, sol_size
+    integer help, test, gallery, benchmark
+    integer i, num_args, err
+    character arg*32, arg_value*32
+    real*8 exact_volume, computed_volume
+
+    integer ceed
+    real*8, dimension (:), allocatable :: u_array, v_array
+    integer mesh_coords, q_data, u, v
+    integer mesh_restriction, sol_restriction, q_data_restriction
+    integer mesh_basis, sol_basis
+    integer*8 offset
+    integer build_ctx
+    integer build_ctx_size
+    parameter(build_ctx_size = 2)
+    integer*8 build_ctx_data(build_ctx_size)
+    integer qf_build, qf_apply
+    integer op_build, op_apply
+
+    external build_mass, apply_mass
+
+! Initial values
+    ceed_spec   = '/cpu/self'
+    fe_dim      = 3
+    num_comp_x  = 3
+    mesh_degree = 4
+    sol_degree  = 4
+    num_qpts    = mesh_degree + 2
+    prob_size   = -1
+    help      = 0
+    test      = 0
+    gallery   = 0
+    benchmark = 0
+
+! Process command line arguments
+   
+    num_args = command_argument_count()
+    do i = 1, num_args
+      call get_command_argument(i, arg)
+
+      select case (arg)
+! LCOV_EXCL_START
+        case ('-h')
+          help = 1
+
+        case ('-c',  '-ceed')
+          call get_command_argument(i + 1, ceed_spec)
+
+        case ('-d')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') fe_dim
+          num_comp_x = fe_dim
+
+        case ('-m')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') mesh_degree
+
+        case ('-p')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') sol_degree
+
+        case ('-q')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') num_qpts
+
+        case ('-s')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') prob_size
+
+        case ('-b')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') benchmark
+! LCOV_EXCL_STOP
+
+        case ('-t')
+          test = 1
+
+        case ('-g')
+          gallery = 1
+      end select
+    end do
+
+    if (prob_size < 0) then
+      if (test == 1) then
+        prob_size = 8 * 16
+      else
+        prob_size = 256 * 1024
+      end if
+    end if
+
+! Print options
+    if ((test /= 1) .OR. (help == 1)) then
+! LCOV_EXCL_START
+    write (*, *) 'Selected options: [command line option] : <current value>'
+    write (*, *) '  Ceed specification     [-c] : ', ceed_spec
+    write (*, *) '  Mesh dimension         [-d] : ', fe_dim
+    write (*, *) '  Mesh degree            [-m] : ', mesh_degree
+    write (*, *) '  Solution degree        [-p] : ', sol_degree
+    write (*, *) '  Num. 1D quadrature pts [-q] : ', num_qpts
+    write (*, *) '  Approx. # unknowns     [-s] : ', prob_size
+    if (gallery == 1) then
+      write (*, *) '  QFunction source       [-g] : gallery'
+    else
+      write (*, *) '  QFunction source       [-g] : header'
+    end if
+    if (help == 1) then
+      if (test == 0) then
+        write (*, *) 'Test/quiet mode is OFF (use -t to enable)'
+      else
+        write (*, *) 'Test/quiet mode is ON'
+      end if
+    end if
+! LCOV_EXCL_STOP
+    end if
+
+! Select appropriate backend and logical device based on the (-ceed) command line argument
+    call ceedinit(trim(ceed_spec)//char(0), ceed, err)
+
+! Construct the mesh and solution bases
+    call ceedbasiscreatetensorh1lagrange(ceed, fe_dim, num_comp_x, mesh_degree + 1, num_qpts, ceed_gauss, mesh_basis,&
+             &err)
+    call ceedbasiscreatetensorh1lagrange(ceed, fe_dim, 1, sol_degree + 1, num_qpts, ceed_gauss, sol_basis, err)
+
+! Determine the mesh size based on the given approximate problem size
+    call getcartesianmeshsize(fe_dim, sol_degree, prob_size, num_xyz)
+    if (test == 0) then
+! LCOV_EXCL_START
+    write (*, '(A16, I8)', advance='no') 'Mesh size: nx = ', num_xyz(1)
+    if (num_comp_x > 1) then
+      write (*, '(A7, I8)', advance='no') ',  ny = ', num_xyz(2)
+    end if
+    if (num_comp_x > 2) then
+      write (*, '(A7, I8)', advance='no') ',  nz = ', num_xyz(3)
+    end if
+    write (*, *)
+! LCOV_EXCL_STOP
+    endif
+
+! Build CeedElemRestriction objects describing the mesh and solution discrete representation
+    call buildcartesianrestriction(ceed, fe_dim, num_xyz, mesh_degree, num_comp_x, mesh_size, num_qpts,&
+             &mesh_restriction, ceed_qfunction_none, err)
+    call buildcartesianrestriction(ceed, fe_dim, num_xyz, sol_degree, 1, sol_size, num_qpts, sol_restriction,&
+             &q_data_restriction, err)
+
+    if (test == 0) then
+! LCOV_EXCL_START
+      write (*, *) 'Number of mesh nodes     : ', mesh_size/fe_dim
+      write (*, *) 'Number of solution nodes : ', sol_size
+! LCOV_EXCL_STOP
+    end if
+
+! Create a CeedVector with the mesh coordinates
+! Apply a transformation to the mesh
+    call ceedvectorcreate(ceed, mesh_size, mesh_coords, err)
+    call setcartesianmeshcoords(fe_dim, num_xyz, mesh_degree, mesh_coords, exact_volume, err)
+
+! Context data to be passed to the 'build_mass' QFunction
+    build_ctx_data(1) = fe_dim
+    build_ctx_data(2) = num_comp_x
+    call ceedqfunctioncontextcreate(ceed, build_ctx, err)
+! Note: The context technically only takes arrays of double precision values, but we can pass arrays of ints of the same length
+    offset = 0
+    call ceedqfunctioncontextsetdata(build_ctx, ceed_mem_host, ceed_use_pointer, build_ctx_size, build_ctx_data,&
+             &offset, err)
+
+! Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data
+    if (gallery == 1) then
+      select case (fe_dim)
+        case (1)
+          call ceedqfunctioncreateinteriorbyname(ceed, 'Mass1DBuild', qf_build, err)
+
+        case (2)
+          call ceedqfunctioncreateinteriorbyname(ceed, 'Mass2DBuild', qf_build, err)
+
+        case (3)
+          call ceedqfunctioncreateinteriorbyname(ceed, 'Mass3DBuild', qf_build, err)
+      end select
+    else
+      call ceedqfunctioncreateinterior(ceed, 1, build_mass,&
+               &SOURCE_DIR&
+               &//'ex1-volume-f-c.h:build_mass'//char(0), qf_build, err)
+      call ceedqfunctionaddinput(qf_build, 'dx', num_comp_x * fe_dim, ceed_eval_grad, err)
+      call ceedqfunctionaddinput(qf_build, 'weights', 1, ceed_eval_weight, err)
+      call ceedqfunctionaddoutput(qf_build, 'qdata', 1, ceed_eval_none, err)
+      call ceedqfunctionsetcontext(qf_build, build_ctx, err)
+    end if
+
+! Create the operator that builds the quadrature data for the mass operator
+    call ceedoperatorcreate(ceed, qf_build, ceed_qfunction_none, ceed_qfunction_none, op_build, err)
+    call ceedoperatorsetfield(op_build, 'dx', mesh_restriction, mesh_basis, ceed_vector_active, err)
+    call ceedoperatorsetfield(op_build, 'weights', ceed_elemrestriction_none, mesh_basis, ceed_vector_none, err)
+    call ceedoperatorsetfield(op_build, 'qdata', q_data_restriction, ceed_basis_none, ceed_vector_active, err)
+
+! Compute the quadrature data for the mass operator
+    num_elem  = 1
+    elem_qpts = num_qpts**fe_dim
+    do i = 1, fe_dim
+      num_elem = num_elem * num_xyz(i)
+    end do
+    call ceedvectorcreate(ceed, num_elem * elem_qpts, q_data, err)
+    call ceedoperatorapply(op_build, mesh_coords, q_data, ceed_request_immediate, err)
+
+! Create the QFunction that defines the action of the mass operator
+    if (gallery == 1) then
+      call ceedqfunctioncreateinteriorbyname(ceed, 'MassApply', qf_apply, err)
+    else
+      call ceedqfunctioncreateinterior(ceed, 1, apply_mass,&
+               &SOURCE_DIR&
+               &//'ex1-volume-f-c.h:apply_mass'//char(0), qf_apply, err)
+      call ceedqfunctionaddinput(qf_apply, 'u', 1, ceed_eval_interp, err)
+      call ceedqfunctionaddinput(qf_apply, 'qdata', 1, ceed_eval_none, err)
+      call ceedqfunctionaddoutput(qf_apply, 'v', 1, ceed_eval_interp, err)
+    end if
+
+! Create the mass operator
+    call ceedoperatorcreate(ceed, qf_apply, ceed_qfunction_none, ceed_qfunction_none, op_apply, err)
+    call ceedoperatorsetfield(op_apply, 'u', sol_restriction, sol_basis, ceed_vector_active, err)
+    call ceedoperatorsetfield(op_apply, 'qdata', q_data_restriction, ceed_basis_none, q_data, err)
+    call ceedoperatorsetfield(op_apply, 'v', sol_restriction, sol_basis, ceed_vector_active, err)
+
+! Create auxiliary solution-size vectors
+    allocate (u_array(sol_size))
+    allocate (v_array(sol_size))
+
+    call ceedvectorcreate(ceed, sol_size, u, err)
+    offset = 0
+    call ceedvectorsetarray(u, ceed_mem_host, ceed_use_pointer, u_array, offset, err)
+    call ceedvectorcreate(ceed, sol_size, v, err)
+    offset = 0
+    call ceedvectorsetarray(v, ceed_mem_host, ceed_use_pointer, v_array, offset, err)
+
+! Initialize 'u' with ones
+    call ceedvectorsetvalue(u, 1.d0, err)
+
+! Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1
+    call ceedoperatorapply(op_apply, u, v, ceed_request_immediate, err)
+
+! Benchmark runs
+    if (test /= 1 .AND. benchmark /= 0) then
+! LCOV_EXCL_START
+      write (*, *) ' Executing ', benchmark, ' benchmarking runs...'
+! LCOV_EXCL_STOP
+    end if
+    do i = 1, benchmark
+! LCOV_EXCL_START
+      call ceedoperatorapply(op_apply, u, v, ceed_request_immediate, err)
+! LCOV_EXCL_STOP
+    end do
+
+! Compute and print the sum of the entries of 'v' giving the mesh volume
+    computed_volume = 0.d0
+
+    call ceedvectorgetarrayread(v, ceed_mem_host, v_array, offset, err)
+    do i = 1, sol_size
+      computed_volume = computed_volume + v_array(offset + i)
+    end do
+    call ceedvectorrestorearrayread(v, v_array, offset, err)
+
+    if (test /= 1) then
+! LCOV_EXCL_START
+      write (*, *) ' done.'
+      write (*, *) 'Exact mesh volume    :', exact_volume
+      write (*, *) 'Computed mesh volume :', computed_volume
+      write (*, *) 'Volume error         :', (exact_volume - computed_volume)
+! LCOV_EXCL_STOP
+    else
+      if (fe_dim == 1) then
+        if (abs(exact_volume - computed_volume) > 200.d0 * 1e-15) then
+! LCOV_EXCL_START
+          write (*, *) 'Volume error : ', (exact_volume - computed_volume)
+! LCOV_EXCL_STOP
+        end if
+      else
+        if (abs(exact_volume - computed_volume) > 1e-5) then
+! LCOV_EXCL_START
+          write (*, *) 'Volume error : ', (exact_volume - computed_volume)
+! LCOV_EXCL_STOP
+        end if
+      end if
+    end if
+
+! Free dynamically allocated memory
+    call ceedvectordestroy(mesh_coords, err)
+    call ceedvectordestroy(q_data, err)
+    call ceedvectordestroy(u, err)
+    call ceedvectordestroy(v, err)
+    deallocate (u_array)
+    deallocate (v_array)
+    call ceedbasisdestroy(sol_basis, err)
+    call ceedbasisdestroy(mesh_basis, err)
+    call ceedqfunctioncontextdestroy(build_ctx, err)
+    call ceedqfunctiondestroy(qf_build, err)
+    call ceedqfunctiondestroy(qf_apply, err)
+    call ceedoperatordestroy(op_build, err)
+    call ceedoperatordestroy(op_apply, err)
+    call ceeddestroy(ceed, err)
+end
+!-----------------------------------------------------------------------
diff --git a/examples/ceed/ex1-volume-f.h b/examples/ceed/ex1-volume-f.h
new file mode 100644
index 0000000000..08ea68ef6f
--- /dev/null
+++ b/examples/ceed/ex1-volume-f.h
@@ -0,0 +1,55 @@
+!-----------------------------------------------------------------------
+subroutine build_mass(ctx, q, j, w, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16,&
+    qdata, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, ierr)
+      integer*8 ctx(2)
+      integer*8 fe_dim, space_dim
+! j is Jacobians with shape [dim,  dim, Q]
+! w is quadrature weights with shape [1, Q]
+      real*8 j(1)
+      real*8 w(1)
+! qdata is quadrature data with shape [1, Q]
+      real*8 qdata(1)
+      integer q, ierr
+
+      fe_dim = ctx(1)
+      space_dim = ctx(2)
+
+      select case (fe_dim + 10*space_dim)
+        case (11)
+          do i = 1, q
+            qdata(i) = j(i) * w(i)
+          end do
+
+        case (22)
+          do i = 1, q
+            qdata(i) = (j(0*q + i)*j(3*q + i) - j(1*q + i)*j(2*q + i)) * w(i)
+          end do
+
+        case (33)
+          do i = 1, q
+            qdata(i) = (j(0*q + i) * (j(4*q + i)*j(8*q + i) - j(5*q + i)*j(7*q + i)) -&
+                       &j(1*q + i) * (j(3*q + i)*j(8*q + i) - j(5*q + i)*j(6*q + i)) +&
+                       &j(2*q + i) * (j(3*q + i)*j(7*q + i) - j(4*q + i)*j(6*q + i))) * w(i)
+          end do
+      end select
+      ierr = 0
+end
+
+!-----------------------------------------------------------------------
+subroutine apply_mass(ctx, q, u, qdata, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16,&
+    v, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, ierr)
+      integer*8 ctx
+! u is solution variables with shape [1, Q]
+! qdata is quadrature data with shape [1, Q]
+      real*8 u(1)
+      real*8 qdata(1)
+! v is solution variables with shape [1, Q]
+      real*8 v(1)
+      integer q, ierr
+
+      do i = 1, q
+        v(i) = qdata(i) * u(i)
+      end do
+      ierr = 0
+end
+!-----------------------------------------------------------------------
diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h
index 72361e620c..581cff997e 100644
--- a/examples/ceed/ex1-volume.h
+++ b/examples/ceed/ex1-volume.h
@@ -14,11 +14,12 @@ struct BuildContext {
 
 /// libCEED Q-function for building quadrature data for a mass operator
 CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
   // in[0] is Jacobians with shape [dim, dim, Q]
   // in[1] is quadrature weights with shape [1, Q]
-  const CeedScalar    *w          = in[1];
-  CeedScalar          *q_data     = out[0];
-  struct BuildContext *build_data = (struct BuildContext *)ctx;
+  const CeedScalar *w      = in[1];
+  CeedScalar       *q_data = out[0];
 
   switch (build_data->dim + 10 * build_data->space_dim) {
     case 11: {
diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h
index 5e8e253003..c8aa53b29b 100644
--- a/examples/ceed/ex2-surface.h
+++ b/examples/ceed/ex2-surface.h
@@ -14,11 +14,12 @@ struct BuildContext {
 
 /// libCEED Q-function for building quadrature data for a diffusion operator
 CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
   // in[0] is Jacobians with shape [dim, dim, Q]
   // in[1] is quadrature weights, size (Q)
   const CeedScalar *w             = in[1];
   CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-  struct BuildContext *build_data = (struct BuildContext *)ctx;
 
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
   // the symmetric part of the result.
@@ -84,6 +85,7 @@ CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *
 /// libCEED Q-function for applying a diff operator
 CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   struct BuildContext *build_data = (struct BuildContext *)ctx;
+
   // in[0], out[0] solution gradients with shape [dim, 1, Q]
   // in[1] is quadrature data with shape [num_components, Q]
   const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
diff --git a/examples/ceed/ex3-volume.h b/examples/ceed/ex3-volume.h
index 956648a211..0d2c0419e4 100644
--- a/examples/ceed/ex3-volume.h
+++ b/examples/ceed/ex3-volume.h
@@ -14,11 +14,12 @@ struct BuildContext {
 
 /// libCEED Q-function for building quadrature data for a mass + diffusion operator
 CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
   // in[0] is Jacobians with shape [dim, dim, Q]
   // in[1] is quadrature weights, size (Q)
   const CeedScalar *w             = in[1];
   CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-  struct BuildContext *build_data = (struct BuildContext *)ctx;
 
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
   // the symmetric part of the result.
@@ -97,6 +98,7 @@ CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *co
 /// libCEED Q-function for applying a mass + diffusion operator
 CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   struct BuildContext *build_data = (struct BuildContext *)ctx;
+
   // in[1], out[1] solution values with shape [1, 1, Q]
   // in[1], out[1] solution gradients with shape [dim, 1, Q]
   // in[2] is quadrature data with shape [num_components, Q]
diff --git a/tests/junit.py b/tests/junit.py
index f9ef51891d..6ea4bcb0b6 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -65,7 +65,10 @@ def get_source_path(self, test: str) -> Path:
         elif prefix == 'solids':
             return (Path('examples') / 'solids' / rest).with_suffix('.c')
         elif test.startswith('ex'):
-            return (Path('examples') / 'ceed' / test).with_suffix('.c')
+            if test.endswith('-f'):
+                return (Path('examples') / 'ceed' / test).with_suffix('.f90')
+            else:
+                return (Path('examples') / 'ceed' / test).with_suffix('.c')
         elif test.endswith('-f'):
             return (Path('tests') / test).with_suffix('.f90')
         else:

From d416dc2b8eb8ab8cb4fa3546f1e63962299dc06a Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 3 Feb 2026 10:16:29 -0700
Subject: [PATCH 536/571] dealii - add gpu version with kokkos

---
 Makefile                                |   7 +-
 examples/deal.II/CMakeLists.txt         |  22 +-
 examples/deal.II/README.md              |   8 +-
 examples/deal.II/bps-ceed.h             | 648 +++++++++++++++++++
 examples/deal.II/{bps.cc => bps-cpu.cc} |   9 +-
 examples/deal.II/bps-cpu.h              | 219 +++++++
 examples/deal.II/bps-kokkos.cc          | 251 ++++++++
 examples/deal.II/bps-kokkos.h           | 327 ++++++++++
 examples/deal.II/bps.h                  | 808 +-----------------------
 9 files changed, 1490 insertions(+), 809 deletions(-)
 create mode 100644 examples/deal.II/bps-ceed.h
 rename examples/deal.II/{bps.cc => bps-cpu.cc} (98%)
 create mode 100644 examples/deal.II/bps-cpu.h
 create mode 100644 examples/deal.II/bps-kokkos.cc
 create mode 100644 examples/deal.II/bps-kokkos.h

diff --git a/Makefile b/Makefile
index ebefbe877d..7e914a7b41 100644
--- a/Makefile
+++ b/Makefile
@@ -338,7 +338,8 @@ examples   := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
 examples   += $(examples.f:examples/ceed/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
 
 # deal.II Examples
-dealiiexamples := $(OBJDIR)/dealii-bps
+dealiiexamples.cc := $(sort $(wildcard examples/deal.II/*.cc))
+dealiiexamples    := $(dealiiexamples.cc:examples/deal.II/%.cc=$(OBJDIR)/dealii-%)
 
 # MFEM Examples
 mfemexamples.cpp := $(sort $(wildcard examples/mfem/*.cpp))
@@ -697,11 +698,11 @@ $(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.f90 | $$(@D)/.DIR
 
 # deal.II
 # Note: Invoking deal.II's CMAKE build system here
-$(OBJDIR)/dealii-bps : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) | $$(@D)/.DIR
+$(OBJDIR)/dealii-% : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) | $$(@D)/.DIR
 	mkdir -p examples/deal.II/build
 	cmake -B examples/deal.II/build -S examples/deal.II -DDEAL_II_DIR=$(DEAL_II_DIR) -DCEED_DIR=$(PWD)
 	+$(call quiet,MAKE) -C examples/deal.II/build
-	cp examples/deal.II/build/bps $(OBJDIR)/dealii-bps
+	cp examples/deal.II/build/$* $@
 
 # MFEM
 $(OBJDIR)/mfem-% : examples/mfem/%.cpp $(libceed) | $$(@D)/.DIR
diff --git a/examples/deal.II/CMakeLists.txt b/examples/deal.II/CMakeLists.txt
index 46522cf0b8..ca87519479 100644
--- a/examples/deal.II/CMakeLists.txt
+++ b/examples/deal.II/CMakeLists.txt
@@ -11,13 +11,21 @@ IF(NOT ${deal.II_FOUND})
     )
 ENDIF()
 
-DEAL_II_INITIALIZE_CACHED_VARIABLES()
-PROJECT("bps")
+FILE(GLOB SOURCE_FILES "*.cc")
 
-DEAL_II_INITIALIZE_CACHED_VARIABLES()
+FOREACH ( source_file ${SOURCE_FILES} )
+  GET_FILENAME_COMPONENT(file_name ${source_file} NAME)
+  STRING( REPLACE ".cc" "" exec ${file_name} )
 
-ADD_EXECUTABLE(bps bps.cc)
-DEAL_II_SETUP_TARGET(bps)
+  DEAL_II_INITIALIZE_CACHED_VARIABLES()
+  PROJECT(${exec})
 
-TARGET_INCLUDE_DIRECTORIES(bps PUBLIC ${CEED_DIR}/include)
-TARGET_LINK_LIBRARIES(bps ${CEED_DIR}/lib/libceed.so)
+  DEAL_II_INITIALIZE_CACHED_VARIABLES()
+
+  ADD_EXECUTABLE(${exec} ${source_file})
+  DEAL_II_SETUP_TARGET(${exec})
+
+  TARGET_INCLUDE_DIRECTORIES(${exec} PUBLIC ${CEED_DIR}/include)
+  TARGET_LINK_LIBRARIES(${exec} ${CEED_DIR}/lib/libceed.so)
+
+ENDFOREACH ( source_file ${SOURCE_FILES} )
diff --git a/examples/deal.II/README.md b/examples/deal.II/README.md
index 1ae1f794d8..18dba6dd7c 100644
--- a/examples/deal.II/README.md
+++ b/examples/deal.II/README.md
@@ -12,10 +12,14 @@ cmake ../ -DDEAL_II_DIR=~/path/to/dealii -DCEED_DIR=~/path/to/libceed
 make
 ```
 
-To run the executable, write:
+To run the executables, write:
 
 ```
-./bps
+./bps_cpu
+```
+
+```
+./bps_kokkos
 ```
 
 Optional command-line arguments are shown by adding the command-line argument "--help".
diff --git a/examples/deal.II/bps-ceed.h b/examples/deal.II/bps-ceed.h
new file mode 100644
index 0000000000..f9041d4c6f
--- /dev/null
+++ b/examples/deal.II/bps-ceed.h
@@ -0,0 +1,648 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+#pragma once
+#ifndef bps_ceed_h
+#  define bps_ceed_h
+
+// deal.II includes
+#  include <deal.II/dofs/dof_tools.h>
+
+#  include <deal.II/fe/mapping.h>
+
+#  include <deal.II/lac/la_parallel_vector.h>
+
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
+
+// local includes
+#  include "bps.h"
+
+// libCEED includes
+#  include <ceed.h>
+#  include <ceed/backend.h>
+
+// QFunction source
+#  include "bps-qfunctions.h"
+
+using namespace dealii;
+
+
+/**
+ * Operator implementation using libCEED.
+ */
+template <int dim, typename Number, typename MemorySpace = MemorySpace::Host>
+class OperatorCeed : public OperatorBase<Number, MemorySpace>
+{
+public:
+  using VectorType = typename OperatorBase<Number, MemorySpace>::VectorType;
+
+  /**
+   * Constructor.
+   */
+  OperatorCeed(const Mapping<dim>              &mapping,
+               const DoFHandler<dim>           &dof_handler,
+               const AffineConstraints<Number> &constraints,
+               const Quadrature<dim>           &quadrature,
+               const BPType                    &bp,
+               const std::string               &resource)
+    : mapping(mapping)
+    , dof_handler(dof_handler)
+    , constraints(constraints)
+    , quadrature(quadrature)
+    , bp(bp)
+    , resource(resource)
+  {
+    reinit();
+  }
+
+  /**
+   * Destructor.
+   */
+  ~OperatorCeed()
+  {
+    CeedVectorDestroy(&src_ceed);
+    CeedVectorDestroy(&dst_ceed);
+    CeedOperatorDestroy(&op_apply);
+    CeedDestroy(&ceed);
+  }
+
+  /**
+   * Initialized internal data structures, particularly, libCEED.
+   */
+  void
+  reinit() override
+  {
+    CeedVector           metric_data;
+    CeedBasis            sol_basis;
+    CeedElemRestriction  sol_restriction;
+    CeedElemRestriction  metric_data_restriction;
+    BuildContext         build_ctx_data;
+    CeedQFunctionContext build_ctx;
+    CeedQFunction        qf_apply;
+
+    const auto &tria = dof_handler.get_triangulation();
+    const auto &fe   = dof_handler.get_fe();
+
+    const auto n_components = fe.n_components();
+
+    if (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5)
+      {
+        AssertThrow(n_components == 1, ExcInternalError());
+      }
+    else
+      {
+        AssertThrow(n_components == dim, ExcInternalError());
+      }
+
+    // 1) create CEED instance -> "MatrixFree"
+    const char *ceed_spec = resource.c_str();
+    CeedInit(ceed_spec, &ceed);
+
+    // 2) create shape functions -> "ShapeInfo"
+    const unsigned int fe_degree  = fe.tensor_degree();
+    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
+    {
+      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature, fe, 0);
+      const auto             &shape_data = shape_info.get_shape_data();
+      std::vector<CeedScalar> q_ref_1d;
+      for (const auto q : shape_data.quadrature.get_points())
+        q_ref_1d.push_back(q(0));
+
+      // transpose bases for compatibility with restriction
+      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
+      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
+      for (unsigned int i = 0; i < n_q_points; ++i)
+        for (unsigned int j = 0; j < fe_degree + 1; ++j)
+          {
+            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
+            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
+          }
+
+      CeedBasisCreateTensorH1(ceed,
+                              dim,
+                              n_components,
+                              fe_degree + 1,
+                              n_q_points,
+                              interp_1d.data(),
+                              grad_1d.data(),
+                              q_ref_1d.data(),
+                              quadrature.get_tensor_basis()[0].get_weights().data(),
+                              &sol_basis);
+    }
+
+    // 3) create restriction matrix -> DoFInfo
+    unsigned int n_local_active_cells = 0;
+
+    for (const auto &cell : dof_handler.active_cell_iterators())
+      if (cell->is_locally_owned())
+        n_local_active_cells++;
+
+    partitioner =
+      std::make_shared<Utilities::MPI::Partitioner>(dof_handler.locally_owned_dofs(),
+                                                    DoFTools::extract_locally_active_dofs(
+                                                      dof_handler),
+                                                    dof_handler.get_communicator());
+
+    std::vector<CeedInt> indices;
+    indices.reserve(n_local_active_cells * fe.n_dofs_per_cell() / n_components);
+
+    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
+
+    std::vector<types::global_dof_index> local_indices(fe.n_dofs_per_cell());
+
+    for (const auto &cell : dof_handler.active_cell_iterators())
+      if (cell->is_locally_owned())
+        {
+          cell->get_dof_indices(local_indices);
+
+          for (const auto i : dof_mapping)
+            indices.emplace_back(
+              partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)]));
+        }
+
+    CeedElemRestrictionCreate(ceed,
+                              n_local_active_cells,
+                              fe.n_dofs_per_cell() / n_components,
+                              n_components,
+                              1,
+                              this->extended_local_size(),
+                              CEED_MEM_HOST,
+                              CEED_COPY_VALUES,
+                              indices.data(),
+                              &sol_restriction);
+
+    // 4) create mapping -> MappingInfo
+    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
+
+    metric_data_raw = compute_metric_data(ceed, mapping, tria, quadrature, bp);
+
+    strides = {{1,
+                static_cast<int>(quadrature.size()),
+                static_cast<int>(quadrature.size() * n_components_metric)}};
+    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
+    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
+    CeedElemRestrictionCreateStrided(ceed,
+                                     n_local_active_cells,
+                                     quadrature.size(),
+                                     n_components_metric,
+                                     metric_data_raw.size(),
+                                     strides.data(),
+                                     &metric_data_restriction);
+
+    build_ctx_data.dim       = dim;
+    build_ctx_data.space_dim = dim;
+
+    CeedQFunctionContextCreate(ceed, &build_ctx);
+    CeedQFunctionContextSetData(
+      build_ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(build_ctx_data), &build_ctx_data);
+
+    // 5) create q operation
+    if (bp == BPType::BP1)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &qf_apply);
+    else if (bp == BPType::BP2)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass_vec, f_apply_mass_vec_loc, &qf_apply);
+    else if (bp == BPType::BP3 || bp == BPType::BP5)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson, f_apply_poisson_loc, &qf_apply);
+    else if (bp == BPType::BP4 || bp == BPType::BP6)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson_vec, f_apply_poisson_vec_loc, &qf_apply);
+    else
+      AssertThrow(false, ExcInternalError());
+
+    if (bp <= BPType::BP2)
+      CeedQFunctionAddInput(qf_apply, "u", n_components, CEED_EVAL_INTERP);
+    else
+      CeedQFunctionAddInput(qf_apply, "u", dim * n_components, CEED_EVAL_GRAD);
+
+    CeedQFunctionAddInput(qf_apply, "metric data", n_components_metric, CEED_EVAL_NONE);
+
+    if (bp <= BPType::BP2)
+      CeedQFunctionAddOutput(qf_apply, "v", n_components, CEED_EVAL_INTERP);
+    else
+      CeedQFunctionAddOutput(qf_apply, "v", dim * n_components, CEED_EVAL_GRAD);
+
+    CeedQFunctionSetContext(qf_apply, build_ctx);
+
+    // 6) put everything together
+    CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
+
+    CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(
+      op_apply, "metric data", metric_data_restriction, CEED_BASIS_NONE, metric_data);
+    CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+
+    // 7) libCEED vectors
+    CeedElemRestrictionCreateVector(sol_restriction, &src_ceed, NULL);
+    CeedElemRestrictionCreateVector(sol_restriction, &dst_ceed, NULL);
+
+    // 8) cleanup
+    CeedVectorDestroy(&metric_data);
+    CeedElemRestrictionDestroy(&metric_data_restriction);
+    CeedElemRestrictionDestroy(&sol_restriction);
+    CeedBasisDestroy(&sol_basis);
+    CeedQFunctionContextDestroy(&build_ctx);
+    CeedQFunctionDestroy(&qf_apply);
+  }
+
+  /**
+   * Perform matrix-vector product.
+   */
+  void
+  vmult(VectorType &dst, const VectorType &src) const override
+  {
+    // communicate: update ghost values
+    src.update_ghost_values();
+
+    // pass memory buffers to libCEED
+    VectorTypeCeed x(src_ceed);
+    VectorTypeCeed y(dst_ceed);
+    x.import_array(src, CEED_MEM_HOST);
+    y.import_array(dst, CEED_MEM_HOST);
+
+    // apply operator
+    CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
+
+    // pull arrays back to deal.II
+    x.take_array();
+    y.take_array();
+
+    // communicate: compress
+    src.zero_out_ghost_values();
+    dst.compress(VectorOperation::add);
+
+    // apply constraints: we assume homogeneous DBC
+    constraints.set_zero(dst);
+  }
+
+  /**
+   * Initialized vector.
+   */
+  void
+  initialize_dof_vector(VectorType &vec) const override
+  {
+    vec.reinit(partitioner);
+  }
+
+  /**
+   * Compute inverse of diagonal.
+   */
+  void
+  compute_inverse_diagonal(VectorType &diagonal) const override
+  {
+    this->initialize_dof_vector(diagonal);
+
+    // pass memory buffer to libCEED
+    VectorTypeCeed y(dst_ceed);
+    y.import_array(diagonal, CEED_MEM_HOST);
+
+    CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
+
+    // pull array back to deal.II
+    y.take_array();
+
+    diagonal.compress(VectorOperation::add);
+
+    // apply constraints: we assume homogeneous DBC
+    constraints.set_zero(diagonal);
+
+    for (auto &i : diagonal)
+      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
+  }
+
+private:
+  /**
+   * Wrapper around a deal.II vector to create a libCEED vector view.
+   */
+  class VectorTypeCeed
+  {
+  public:
+    /**
+     * Constructor.
+     */
+    VectorTypeCeed(const CeedVector &vec_orig)
+    {
+      vec_ceed = NULL;
+      CeedVectorReferenceCopy(vec_orig, &vec_ceed);
+    }
+
+    /**
+     * Return libCEED vector view.
+     */
+    CeedVector &
+    operator()()
+    {
+      return vec_ceed;
+    }
+
+    /**
+     * Set deal.II memory in libCEED vector.
+     */
+    void
+    import_array(const VectorType &vec, const CeedMemType space)
+    {
+      mem_space = space;
+      CeedVectorSetArray(vec_ceed, mem_space, CEED_USE_POINTER, vec.get_values());
+    }
+
+    /**
+     * Sync memory from device to host.
+     */
+    void
+    sync_array()
+    {
+      CeedVectorSyncArray(vec_ceed, mem_space);
+    }
+
+    /**
+     * Take previously set deal.II array from libCEED vector
+     */
+    void
+    take_array()
+    {
+      CeedScalar *ptr;
+      CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
+    }
+
+    /**
+     * Destructor: destroy vector view.
+     */
+    ~VectorTypeCeed()
+    {
+      bool has_array;
+      CeedVectorHasBorrowedArrayOfType(vec_ceed, mem_space, &has_array);
+      if (has_array)
+        {
+          CeedScalar *ptr;
+          CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
+        }
+      CeedVectorDestroy(&vec_ceed);
+    }
+
+  private:
+    /**
+     * libCEED vector view.
+     */
+    CeedMemType mem_space;
+    CeedVector  vec_ceed;
+  };
+
+  /**
+   * Number of locally active DoFs.
+   */
+  unsigned int
+  extended_local_size() const
+  {
+    return partitioner->locally_owned_size() + partitioner->n_ghost_indices();
+  }
+
+  /**
+   * Compute metric data: Jacobian, ...
+   */
+  static std::vector<double>
+  compute_metric_data(const Ceed               &ceed,
+                      const Mapping<dim>       &mapping,
+                      const Triangulation<dim> &tria,
+                      const Quadrature<dim>    &quadrature,
+                      const BPType              bp)
+  {
+    std::vector<double> metric_data_raw;
+
+    CeedBasis            geo_basis;
+    CeedVector           metric_data;
+    CeedElemRestriction  metric_data_restriction;
+    CeedVector           node_coords;
+    CeedElemRestriction  geo_restriction;
+    CeedQFunctionContext build_ctx;
+    CeedQFunction        qf_build;
+    CeedOperator         op_build;
+
+    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
+
+    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
+
+    const auto mapping_q = dynamic_cast<const MappingQ<dim> *>(&mapping);
+
+    AssertThrow(mapping_q, ExcMessage("Wrong mapping!"));
+
+    const unsigned int fe_degree = mapping_q->get_degree();
+
+    FE_Q<dim> geo_fe(fe_degree);
+
+    {
+      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature,
+                                                                                geo_fe,
+                                                                                0);
+      const auto             &shape_data = shape_info.get_shape_data();
+      std::vector<CeedScalar> q_ref_1d;
+      for (const auto q : shape_data.quadrature.get_points())
+        q_ref_1d.push_back(q(0));
+
+      // transpose bases for compatibility with restriction
+      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
+      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
+      for (unsigned int i = 0; i < n_q_points; ++i)
+        for (unsigned int j = 0; j < fe_degree + 1; ++j)
+          {
+            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
+            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
+          }
+
+      CeedBasisCreateTensorH1(ceed,
+                              dim,
+                              dim,
+                              fe_degree + 1,
+                              n_q_points,
+                              interp_1d.data(),
+                              grad_1d.data(),
+                              q_ref_1d.data(),
+                              quadrature.get_tensor_basis()[0].get_weights().data(),
+                              &geo_basis);
+    }
+
+    unsigned int n_local_active_cells = 0;
+
+    for (const auto &cell : tria.active_cell_iterators())
+      if (cell->is_locally_owned())
+        n_local_active_cells++;
+
+    std::vector<double>  geo_support_points;
+    std::vector<CeedInt> geo_indices;
+
+    DoFHandler<dim> geo_dof_handler(tria);
+    geo_dof_handler.distribute_dofs(geo_fe);
+
+    const auto geo_partitioner =
+      std::make_shared<Utilities::MPI::Partitioner>(geo_dof_handler.locally_owned_dofs(),
+                                                    DoFTools::extract_locally_active_dofs(
+                                                      geo_dof_handler),
+                                                    geo_dof_handler.get_communicator());
+
+    geo_indices.reserve(n_local_active_cells * geo_fe.n_dofs_per_cell());
+
+    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
+
+    FEValues<dim> fe_values(mapping,
+                            geo_fe,
+                            geo_fe.get_unit_support_points(),
+                            update_quadrature_points);
+
+    std::vector<types::global_dof_index> local_indices(geo_fe.n_dofs_per_cell());
+
+    const unsigned int n_points =
+      geo_partitioner->locally_owned_size() + geo_partitioner->n_ghost_indices();
+
+    geo_support_points.resize(dim * n_points);
+
+    for (const auto &cell : geo_dof_handler.active_cell_iterators())
+      if (cell->is_locally_owned())
+        {
+          fe_values.reinit(cell);
+          cell->get_dof_indices(local_indices);
+
+          for (const auto i : dof_mapping)
+            {
+              const auto index = geo_partitioner->global_to_local(local_indices[i]);
+              geo_indices.emplace_back(index * dim);
+
+              const auto point = fe_values.quadrature_point(i);
+
+              for (unsigned int d = 0; d < dim; ++d)
+                geo_support_points[index * dim + d] = point[d];
+            }
+        }
+
+    metric_data_raw.resize(n_local_active_cells * quadrature.size() * n_components_metric);
+
+    CeedInt strides[3] = {1,
+                          static_cast<int>(quadrature.size()),
+                          static_cast<int>(quadrature.size() * n_components_metric)};
+
+    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
+    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
+    CeedElemRestrictionCreateStrided(ceed,
+                                     n_local_active_cells,
+                                     quadrature.size(),
+                                     n_components_metric,
+                                     metric_data_raw.size(),
+                                     strides,
+                                     &metric_data_restriction);
+
+    CeedVectorCreate(ceed, geo_support_points.size(), &node_coords);
+    CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, geo_support_points.data());
+
+    CeedElemRestrictionCreate(ceed,
+                              n_local_active_cells,
+                              geo_fe.n_dofs_per_cell(),
+                              dim,
+                              1,
+                              geo_support_points.size(),
+                              CEED_MEM_HOST,
+                              CEED_COPY_VALUES,
+                              geo_indices.data(),
+                              &geo_restriction);
+
+    BuildContext build_ctx_data;
+    build_ctx_data.dim       = dim;
+    build_ctx_data.space_dim = dim;
+
+    CeedQFunctionContextCreate(ceed, &build_ctx);
+    CeedQFunctionContextSetData(
+      build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
+
+    // 5) create q operation
+    if (bp <= BPType::BP2)
+      CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &qf_build);
+    else
+      CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build);
+
+    CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_build, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_build, "metric data", n_components_metric, CEED_EVAL_NONE);
+    CeedQFunctionSetContext(qf_build, build_ctx);
+
+    // 6) put everything together
+    CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
+    CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(
+      op_build, "weight", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
+    CeedOperatorSetField(
+      op_build, "metric data", metric_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+    CeedOperatorApply(op_build, node_coords, metric_data, CEED_REQUEST_IMMEDIATE);
+
+    CeedVectorDestroy(&node_coords);
+    CeedVectorSyncArray(metric_data, CEED_MEM_HOST);
+    CeedVectorDestroy(&metric_data);
+    CeedElemRestrictionDestroy(&geo_restriction);
+    CeedElemRestrictionDestroy(&metric_data_restriction);
+    CeedBasisDestroy(&geo_basis);
+    CeedQFunctionContextDestroy(&build_ctx);
+    CeedQFunctionDestroy(&qf_build);
+    CeedOperatorDestroy(&op_build);
+
+    return metric_data_raw;
+  }
+
+  /**
+   * Mapping object passed to the constructor.
+   */
+  const Mapping<dim> &mapping;
+
+  /**
+   * DoFHandler object passed to the constructor.
+   */
+  const DoFHandler<dim> &dof_handler;
+
+  /**
+   * Constraints object passed to the constructor.
+   */
+  const AffineConstraints<Number> &constraints;
+
+  /**
+   * Quadrature rule object passed to the constructor.
+   */
+  const Quadrature<dim> &quadrature;
+
+  /**
+   * Selected BP.
+   */
+  const BPType bp;
+
+  /**
+   * Resource name.
+   */
+  const std::string resource;
+
+  /**
+   * Partitioner for distributed vectors.
+   */
+  std::shared_ptr<Utilities::MPI::Partitioner> partitioner;
+
+  /**
+   * libCEED data structures.
+   */
+  Ceed                   ceed;
+  std::vector<double>    metric_data_raw;
+  std::array<CeedInt, 3> strides;
+  CeedVector             src_ceed;
+  CeedVector             dst_ceed;
+  CeedOperator           op_apply;
+};
+
+#endif
diff --git a/examples/deal.II/bps.cc b/examples/deal.II/bps-cpu.cc
similarity index 98%
rename from examples/deal.II/bps.cc
rename to examples/deal.II/bps-cpu.cc
index c059460656..2355078ccf 100644
--- a/examples/deal.II/bps.cc
+++ b/examples/deal.II/bps-cpu.cc
@@ -46,7 +46,8 @@
 #include <sstream>
 
 // include operators
-#include "bps.h"
+#include "bps-ceed.h"
+#include "bps-cpu.h"
 
 // Test cases
 //TESTARGS(name="BP1") --resource {ceed_resource} --bp BP1 --fe_degree 2 --print_timings 0
@@ -61,7 +62,7 @@ struct Parameters
   unsigned int n_global_refinements = 1;
   unsigned int fe_degree            = 2;
   bool         print_timings        = true;
-  std::string  libCEED_resource      = "/cpu/self";
+  std::string  libCEED_resource     = "/cpu/self";
 
   bool
   parse(int argc, char *argv[])
@@ -176,6 +177,8 @@ main(int argc, char *argv[])
   DoFHandler<dim> dof_handler(tria);
   dof_handler.distribute_dofs(fe);
 
+  DoFRenumbering::support_point_wise(dof_handler);
+
   AffineConstraints<Number> constraints;
 
   if (!(bp == BPType::BP1 || bp == BPType::BP2))
@@ -185,8 +188,6 @@ main(int argc, char *argv[])
       constraints.close();
     }
 
-  DoFRenumbering::support_point_wise(dof_handler);
-
   const auto test = [&](const std::string &label, const auto &op) {
     (void)label;
 
diff --git a/examples/deal.II/bps-cpu.h b/examples/deal.II/bps-cpu.h
new file mode 100644
index 0000000000..71c00cea5d
--- /dev/null
+++ b/examples/deal.II/bps-cpu.h
@@ -0,0 +1,219 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+#pragma once
+#ifndef bps_cpu_h
+#  define bps_cpu_h
+
+// deal.II includes
+#  include <deal.II/dofs/dof_tools.h>
+
+#  include <deal.II/fe/mapping.h>
+
+#  include <deal.II/lac/la_parallel_vector.h>
+
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
+
+// local includes
+#  include "bps.h"
+
+using namespace dealii;
+
+
+
+/**
+ * Operator CPU implementation using deal.II.
+ */
+template <int dim, typename Number>
+class OperatorDealii : public OperatorBase<Number, MemorySpace::Host>
+{
+public:
+  using VectorType = typename OperatorBase<Number, MemorySpace::Host>::VectorType;
+
+  /**
+   * Constructor.
+   */
+  OperatorDealii(const Mapping<dim>              &mapping,
+                 const DoFHandler<dim>           &dof_handler,
+                 const AffineConstraints<Number> &constraints,
+                 const Quadrature<dim>           &quadrature,
+                 const BPType                    &bp)
+    : mapping(mapping)
+    , dof_handler(dof_handler)
+    , constraints(constraints)
+    , quadrature(quadrature)
+    , bp(bp)
+  {
+    reinit();
+  }
+
+  /**
+   * Destructor.
+   */
+  ~OperatorDealii() = default;
+
+  /**
+   * Initialized internal data structures, particularly, MatrixFree.
+   */
+  void
+  reinit() override
+  {
+    // configure MatrixFree
+    typename MatrixFree<dim, Number>::AdditionalData additional_data;
+    additional_data.tasks_parallel_scheme =
+      MatrixFree<dim, Number>::AdditionalData::TasksParallelScheme::none;
+
+    // create MatrixFree
+    matrix_free.reinit(mapping, dof_handler, constraints, quadrature, additional_data);
+  }
+
+  /**
+   * Matrix-vector product.
+   */
+  void
+  vmult(VectorType &dst, const VectorType &src) const override
+  {
+    if (dof_handler.get_fe().n_components() == 1)
+      {
+        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<1>, this, dst, src, true);
+      }
+    else
+      {
+        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
+
+        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<dim>, this, dst, src, true);
+      }
+  }
+
+  /**
+   * Initialize vector.
+   */
+  void
+  initialize_dof_vector(VectorType &vec) const override
+  {
+    matrix_free.initialize_dof_vector(vec);
+  }
+
+  /**
+   * Compute inverse of diagonal.
+   */
+  void
+  compute_inverse_diagonal(VectorType &diagonal) const override
+  {
+    this->initialize_dof_vector(diagonal);
+
+    if (dof_handler.get_fe().n_components() == 1)
+      {
+        MatrixFreeTools::compute_diagonal(matrix_free,
+                                          diagonal,
+                                          &OperatorDealii::do_cell_integral_local<1>,
+                                          this);
+      }
+    else
+      {
+        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
+
+        MatrixFreeTools::compute_diagonal(matrix_free,
+                                          diagonal,
+                                          &OperatorDealii::do_cell_integral_local<dim>,
+                                          this);
+      }
+
+    for (auto &i : diagonal)
+      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
+  }
+
+private:
+  /**
+   * Cell integral without vector access.
+   */
+  template <int n_components>
+  void
+  do_cell_integral_local(FEEvaluation<dim, -1, 0, n_components, Number> &phi) const
+  {
+    if (bp <= BPType::BP2) // mass matrix
+      {
+        phi.evaluate(EvaluationFlags::values);
+        for (const auto q : phi.quadrature_point_indices())
+          phi.submit_value(phi.get_value(q), q);
+        phi.integrate(EvaluationFlags::values);
+      }
+    else // Poisson operator
+      {
+        phi.evaluate(EvaluationFlags::gradients);
+        for (const auto q : phi.quadrature_point_indices())
+          phi.submit_gradient(phi.get_gradient(q), q);
+        phi.integrate(EvaluationFlags::gradients);
+      }
+  }
+
+  /**
+   * Cell integral on a range of cells.
+   */
+  template <int n_components>
+  void
+  do_cell_integral_range(const MatrixFree<dim, Number>               &matrix_free,
+                         VectorType                                  &dst,
+                         const VectorType                            &src,
+                         const std::pair<unsigned int, unsigned int> &range) const
+  {
+    FEEvaluation<dim, -1, 0, n_components, Number> phi(matrix_free, range);
+
+    for (unsigned cell = range.first; cell < range.second; ++cell)
+      {
+        phi.reinit(cell);
+        phi.read_dof_values(src);            // read source vector
+        do_cell_integral_local(phi);         // cell integral
+        phi.distribute_local_to_global(dst); // write to destination vector
+      }
+  }
+
+  /**
+   * Mapping object passed to the constructor.
+   */
+  const Mapping<dim> &mapping;
+
+  /**
+   * DoFHandler object passed to the constructor.
+   */
+  const DoFHandler<dim> &dof_handler;
+
+  /**
+   * Constraints object passed to the constructor.
+   */
+  const AffineConstraints<Number> &constraints;
+
+  /**
+   * Quadrature rule object passed to the constructor.
+   */
+  const Quadrature<dim> &quadrature;
+
+  /**
+   * Selected BP.
+   */
+  const BPType bp;
+
+  /**
+   * MatrixFree object.
+   */
+  MatrixFree<dim, Number> matrix_free;
+};
+
+#endif
diff --git a/examples/deal.II/bps-kokkos.cc b/examples/deal.II/bps-kokkos.cc
new file mode 100644
index 0000000000..86ef1a1693
--- /dev/null
+++ b/examples/deal.II/bps-kokkos.cc
@@ -0,0 +1,251 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+// deal.II includes
+#include <deal.II/base/conditional_ostream.h>
+#include <deal.II/base/mpi.h>
+#include <deal.II/base/parameter_handler.h>
+#include <deal.II/base/quadrature_lib.h>
+
+#include <deal.II/distributed/shared_tria.h>
+#include <deal.II/distributed/tria.h>
+
+#include <deal.II/dofs/dof_handler.h>
+#include <deal.II/dofs/dof_renumbering.h>
+
+#include <deal.II/fe/fe_nothing.h>
+#include <deal.II/fe/fe_q.h>
+#include <deal.II/fe/fe_system.h>
+#include <deal.II/fe/fe_tools.h>
+#include <deal.II/fe/fe_values.h>
+#include <deal.II/fe/mapping_q1.h>
+
+#include <deal.II/grid/grid_generator.h>
+
+#include <deal.II/lac/affine_constraints.h>
+#include <deal.II/lac/precondition.h>
+#include <deal.II/lac/solver_cg.h>
+
+// boost
+#include <boost/algorithm/string.hpp>
+
+#include <sstream>
+
+// include operators
+#include "bps-ceed.h"
+#include "bps-kokkos.h"
+
+// Test cases
+//TESTARGS(name="BP1") --resource {ceed_resource} --bp BP1 --fe_degree 2 --print_timings 0
+//TESTARGS(name="BP4") --resource {ceed_resource} --bp BP4 --fe_degree 1 --print_timings 0
+
+/**
+ * Relevant parameters.
+ */
+struct Parameters
+{
+  BPType       bp                   = BPType::BP5;
+  unsigned int n_global_refinements = 1;
+  unsigned int fe_degree            = 2;
+  bool         print_timings        = true;
+  std::string  libCEED_resource     = "/cpu/self";
+
+  bool
+  parse(int argc, char *argv[])
+  {
+    if (argc == 1 && (std::string(argv[0]) == "--help"))
+      {
+        std::cout << "Usage: ./bp [OPTION]..." << std::endl;
+        std::cout << std::endl;
+        std::cout << "--bp             name of benchmark (BP1-BP6)" << std::endl;
+        std::cout << "--n_refinements  number of refinements (0-)" << std::endl;
+        std::cout << "--fe_degree      polynomial degree (1-)" << std::endl;
+        std::cout << "--print_timings  name of benchmark (0, 1)" << std::endl;
+        std::cout << "--resource       name of resource (e.g., /cpu/self/avx/blocked)" << std::endl;
+
+        return true;
+      }
+
+    AssertThrow(argc % 2 == 0, ExcInternalError());
+
+    while (argc > 0)
+      {
+        std::string label(argv[0]);
+
+        if ("--bp" == label)
+          {
+            std::string bp_string(argv[1]);
+
+            if (bp_string == "BP1")
+              bp = BPType::BP1;
+            else if (bp_string == "BP2")
+              bp = BPType::BP2;
+            else if (bp_string == "BP3")
+              bp = BPType::BP3;
+            else if (bp_string == "BP4")
+              bp = BPType::BP4;
+            else if (bp_string == "BP5")
+              bp = BPType::BP5;
+            else if (bp_string == "BP6")
+              bp = BPType::BP6;
+            else
+              AssertThrow(false, ExcInternalError());
+          }
+        else if ("--n_refinements" == label)
+          {
+            n_global_refinements = std::atoi(argv[1]);
+          }
+        else if ("--fe_degree" == label)
+          {
+            fe_degree = std::atoi(argv[1]);
+          }
+        else if ("--print_timings" == label)
+          {
+            print_timings = std::atoi(argv[1]);
+          }
+        else if ("--resource" == label)
+          {
+            libCEED_resource = std::string(argv[1]);
+          }
+        else
+          {
+            AssertThrow(false, ExcNotImplemented());
+          }
+
+
+        argc -= 2;
+        argv += 2;
+      }
+
+    return false;
+  }
+};
+
+
+
+int
+main(int argc, char *argv[])
+{
+  Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
+
+  Parameters params;
+  if (params.parse(argc - 1, argv + 1))
+    return 0;
+
+  ConditionalOStream pout(std::cout, Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0);
+
+  //  configuration
+  const BPType bp = params.bp;
+
+  using Number                     = double;
+  using VectorType                 = LinearAlgebra::distributed::Vector<Number, MemorySpace::Default>;
+  const unsigned int dim           = 2;
+  const unsigned int fe_degree     = params.fe_degree;
+  const unsigned int n_q_points    = (bp <= BPType::BP4) ? (fe_degree + 2) : (fe_degree + 1);
+  const unsigned int n_refinements = params.n_global_refinements;
+  const unsigned int n_components =
+    (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5) ? 1 : dim;
+
+  // create mapping, quadrature, fe, mesh, ...
+  MappingQ1<dim> mapping;
+  QGauss<dim>    quadrature(n_q_points);
+  FESystem<dim>  fe(FE_Q<dim>(fe_degree), n_components);
+
+#ifdef DEAL_II_WITH_P4EST
+  parallel::distributed::Triangulation<dim> tria(MPI_COMM_WORLD);
+#else
+  Triangulation<dim> tria;
+#endif
+
+  GridGenerator::hyper_cube(tria);
+  tria.refine_global(n_refinements);
+
+  DoFHandler<dim> dof_handler(tria);
+  dof_handler.distribute_dofs(fe);
+
+  DoFRenumbering::support_point_wise(dof_handler);
+
+  AffineConstraints<Number> constraints;
+
+  if (!(bp == BPType::BP1 || bp == BPType::BP2))
+    {
+      // for stiffness matrix
+      DoFTools::make_zero_boundary_constraints(dof_handler, constraints);
+      constraints.close();
+    }
+
+  const auto test = [&](const std::string &label, const auto &op) {
+    (void)label;
+
+    // initialize vector
+    VectorType u, v;
+    op.initialize_dof_vector(u);
+    op.initialize_dof_vector(v);
+    u = 1.0;
+
+    constraints.set_zero(u);
+
+    // perform matrix-vector product
+    op.vmult(v, u);
+
+    // create solver
+    ReductionControl reduction_control(100, 1e-20, 1e-6);
+
+    // create preconditioner
+    DiagonalMatrix<VectorType> diagonal_matrix;
+    op.compute_inverse_diagonal(diagonal_matrix.get_vector());
+
+    std::chrono::time_point<std::chrono::system_clock> now;
+
+    bool not_converged = false;
+
+    try
+      {
+        // solve problem
+        SolverCG<VectorType> solver(reduction_control);
+        now = std::chrono::system_clock::now();
+        solver.solve(op, v, u, diagonal_matrix);
+      }
+    catch (const SolverControl::NoConvergence &)
+      {
+        pout << "Error: solver failed to converge with" << std::endl;
+        not_converged = true;
+      }
+
+
+    const auto time =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - now)
+        .count() /
+      1e9;
+
+
+    if (params.print_timings || not_converged)
+      {
+        pout << label << ": " << reduction_control.last_step() << " " << v.l2_norm() << " "
+             << (params.print_timings ? time : 0.0) << std::endl;
+      }
+  };
+
+  // create and test the libCEED operator
+  OperatorCeed<dim, Number, MemorySpace::Default> op_ceed(
+    mapping, dof_handler, constraints, quadrature, bp, params.libCEED_resource);
+  test("ceed", op_ceed);
+
+  // create and test a native deal.II operator
+  OperatorDealii<dim, Number> op_dealii(mapping, dof_handler, constraints, quadrature, bp);
+  test("dealii", op_dealii);
+}
diff --git a/examples/deal.II/bps-kokkos.h b/examples/deal.II/bps-kokkos.h
new file mode 100644
index 0000000000..bd8ba4f54f
--- /dev/null
+++ b/examples/deal.II/bps-kokkos.h
@@ -0,0 +1,327 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+#pragma once
+#ifndef bps_kokkos_h
+#  define bps_kokkos_h
+
+// deal.II includes
+#  include <deal.II/dofs/dof_tools.h>
+
+#  include <deal.II/fe/mapping.h>
+
+#  include <deal.II/lac/la_parallel_vector.h>
+
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
+
+// local includes
+#  include "bps.h"
+
+using namespace dealii;
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiMassQuad
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval,
+             const int q_point) const
+  {
+    fe_eval->submit_value(fe_eval->get_value(q_point), q_point);
+  }
+};
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiLaplaceQuad
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval,
+             const int q_point) const
+  {
+    fe_eval->submit_gradient(fe_eval->get_gradient(q_point), q_point);
+  }
+};
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiMassLocal
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(const typename Portable::MatrixFree<dim, Number>::Data *data,
+             const Portable::DeviceVector<Number>                   &src,
+             Portable::DeviceVector<Number>                         &dst) const
+  {
+    Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> fe_eval(data);
+    fe_eval.read_dof_values(src);
+    fe_eval.evaluate(EvaluationFlags::values);
+    fe_eval.apply_for_each_quad_point(
+      OperatorDealiiMassQuad<dim, fe_degree, n_q_points_1d, n_components, Number>());
+    fe_eval.integrate(EvaluationFlags::values);
+    fe_eval.distribute_local_to_global(dst);
+  }
+
+  static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim) * n_components;
+  static const unsigned int n_q_points   = Utilities::pow(n_q_points_1d, dim);
+};
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiLaplaceLocal
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(const typename Portable::MatrixFree<dim, Number>::Data *data,
+             const Portable::DeviceVector<Number>                   &src,
+             Portable::DeviceVector<Number>                         &dst) const
+  {
+    Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> fe_eval(data);
+    fe_eval.read_dof_values(src);
+    fe_eval.evaluate(EvaluationFlags::gradients);
+    fe_eval.apply_for_each_quad_point(
+      OperatorDealiiLaplaceQuad<dim, fe_degree, n_q_points_1d, n_components, Number>());
+    fe_eval.integrate(EvaluationFlags::gradients);
+    fe_eval.distribute_local_to_global(dst);
+  }
+
+  static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim) * n_components;
+  static const unsigned int n_q_points   = Utilities::pow(n_q_points_1d, dim);
+};
+
+
+
+/**
+ * Operator GPU implementation using deal.II.
+ */
+template <int dim, typename Number>
+class OperatorDealii : public OperatorBase<Number, MemorySpace::Default>
+{
+public:
+  using VectorType = typename OperatorBase<Number, MemorySpace::Default>::VectorType;
+
+  /**
+   * Constructor.
+   */
+  OperatorDealii(const Mapping<dim>              &mapping,
+                 const DoFHandler<dim>           &dof_handler,
+                 const AffineConstraints<Number> &constraints,
+                 const Quadrature<dim>           &quadrature,
+                 const BPType                    &bp)
+    : mapping(mapping)
+    , dof_handler(dof_handler)
+    , constraints(constraints)
+    , quadrature(quadrature)
+    , bp(bp)
+  {
+    reinit();
+  }
+
+  /**
+   * Destructor.
+   */
+  ~OperatorDealii() = default;
+
+  /**
+   * Initialized internal data structures, particularly, MatrixFree.
+   */
+  void
+  reinit() override
+  {
+    // configure MatrixFree
+    typename Portable::MatrixFree<dim, Number>::AdditionalData additional_data;
+
+    if (bp <= BPType::BP2) // mass matrix
+      additional_data.mapping_update_flags = update_JxW_values | update_values;
+    else
+      additional_data.mapping_update_flags = update_JxW_values | update_gradients;
+
+    // create MatrixFree
+    AssertThrow(quadrature.is_tensor_product(), ExcNotImplemented());
+    matrix_free.reinit(
+      mapping, dof_handler, constraints, quadrature.get_tensor_basis()[0], additional_data);
+  }
+
+  /**
+   * Matrix-vector product.
+   */
+  void
+  vmult(VectorType &dst, const VectorType &src) const override
+  {
+    dst = 0.0;
+
+    const unsigned int n_components  = dof_handler.get_fe().n_components();
+    const unsigned int fe_degree     = dof_handler.get_fe().tensor_degree();
+    const unsigned int n_q_points_1d = quadrature.get_tensor_basis()[0].size();
+
+    if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 2)
+      this->vmult_internal<1, 1, 2>(dst, src);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 3)
+      this->vmult_internal<1, 2, 3>(dst, src);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 2)
+      this->vmult_internal<dim, 1, 2>(dst, src);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 3)
+      this->vmult_internal<dim, 2, 3>(dst, src);
+    else if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 3)
+      this->vmult_internal<1, 1, 3>(dst, src);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 4)
+      this->vmult_internal<1, 2, 4>(dst, src);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 3)
+      this->vmult_internal<dim, 1, 3>(dst, src);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 4)
+      this->vmult_internal<dim, 2, 4>(dst, src);
+    else
+      AssertThrow(false, ExcInternalError());
+
+    matrix_free.copy_constrained_values(src, dst);
+  }
+
+  /**
+   * Initialize vector.
+   */
+  void
+  initialize_dof_vector(VectorType &vec) const override
+  {
+    matrix_free.initialize_dof_vector(vec);
+  }
+
+  /**
+   * Compute inverse of diagonal.
+   */
+  void
+  compute_inverse_diagonal(VectorType &diagonal) const override
+  {
+    this->initialize_dof_vector(diagonal);
+
+    const unsigned int n_components  = dof_handler.get_fe().n_components();
+    const unsigned int fe_degree     = dof_handler.get_fe().tensor_degree();
+    const unsigned int n_q_points_1d = quadrature.get_tensor_basis()[0].size();
+
+    if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 2)
+      this->compute_inverse_diagonal_internal<1, 1, 2>(diagonal);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<1, 2, 3>(diagonal);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 2)
+      this->compute_inverse_diagonal_internal<dim, 1, 2>(diagonal);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<dim, 2, 3>(diagonal);
+    else if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<1, 1, 3>(diagonal);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 4)
+      this->compute_inverse_diagonal_internal<1, 2, 4>(diagonal);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<dim, 1, 3>(diagonal);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 4)
+      this->compute_inverse_diagonal_internal<dim, 2, 4>(diagonal);
+    else
+      AssertThrow(false, ExcInternalError());
+  }
+
+private:
+  /**
+   * Templated vmult function.
+   */
+  template <int n_components, int fe_degree, int n_q_points_1d>
+  void
+  vmult_internal(VectorType &dst, const VectorType &src) const
+  {
+    if (bp <= BPType::BP2) // mass matrix
+      {
+        OperatorDealiiMassLocal<dim, fe_degree, n_q_points_1d, n_components, Number> mass_operator;
+        matrix_free.cell_loop(mass_operator, src, dst);
+      }
+    else
+      {
+        OperatorDealiiLaplaceLocal<dim, fe_degree, n_q_points_1d, n_components, Number>
+          local_operator;
+        matrix_free.cell_loop(local_operator, src, dst);
+      }
+  }
+
+  /**
+   * Templated compute_inverse_diagonal function.
+   */
+  template <int n_components, int fe_degree, int n_q_points_1d>
+  void
+  compute_inverse_diagonal_internal(VectorType &diagonal) const
+  {
+    if (bp <= BPType::BP2) // mass matrix
+      {
+        OperatorDealiiMassQuad<dim, fe_degree, n_q_points_1d, n_components, Number> op_quad;
+
+        MatrixFreeTools::compute_diagonal<dim, fe_degree, n_q_points_1d, n_components, Number>(
+          matrix_free, diagonal, op_quad, EvaluationFlags::values, EvaluationFlags::values);
+      }
+    else
+      {
+        OperatorDealiiLaplaceQuad<dim, fe_degree, n_q_points_1d, n_components, Number> op_quad;
+
+        MatrixFreeTools::compute_diagonal<dim, fe_degree, n_q_points_1d, n_components, Number>(
+          matrix_free, diagonal, op_quad, EvaluationFlags::gradients, EvaluationFlags::gradients);
+      }
+
+
+    Number *diagonal_ptr = diagonal.get_values();
+
+    Kokkos::parallel_for(
+      "lethe::invert_vector",
+      Kokkos::RangePolicy<MemorySpace::Default::kokkos_space::execution_space>(
+        0, diagonal.locally_owned_size()),
+      KOKKOS_LAMBDA(int i) { diagonal_ptr[i] = 1.0 / diagonal_ptr[i]; });
+  }
+
+  /**
+   * Mapping object passed to the constructor.
+   */
+  const Mapping<dim> &mapping;
+
+  /**
+   * DoFHandler object passed to the constructor.
+   */
+  const DoFHandler<dim> &dof_handler;
+
+  /**
+   * Constraints object passed to the constructor.
+   */
+  const AffineConstraints<Number> &constraints;
+
+  /**
+   * Quadrature rule object passed to the constructor.
+   */
+  const Quadrature<dim> &quadrature;
+
+  /**
+   * Selected BP.
+   */
+  const BPType bp;
+
+  /**
+   * MatrixFree object.
+   */
+  Portable::MatrixFree<dim, Number> matrix_free;
+};
+
+#endif
diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index bcb7899f68..b7d28919bc 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -15,26 +15,25 @@
 //
 // ---------------------------------------------------------------------
 
+#pragma once
+#ifndef bps_h
+#  define bps_h
+
 // deal.II includes
-#include <deal.II/dofs/dof_tools.h>
+#  include <deal.II/dofs/dof_tools.h>
 
-#include <deal.II/fe/mapping.h>
+#  include <deal.II/fe/mapping.h>
 
-#include <deal.II/lac/la_parallel_vector.h>
+#  include <deal.II/lac/la_parallel_vector.h>
 
-#include <deal.II/matrix_free/fe_evaluation.h>
-#include <deal.II/matrix_free/matrix_free.h>
-#include <deal.II/matrix_free/shape_info.h>
-#include <deal.II/matrix_free/tools.h>
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
 
-// libCEED includes
-#include <ceed.h>
-#include <ceed/backend.h>
+using namespace dealii;
 
-// QFunction source
-#include "bps-qfunctions.h"
 
-using namespace dealii;
 
 /**
  * BP types. For more details, see https://ceed.exascaleproject.org/bps/.
@@ -94,14 +93,14 @@ struct BPInfo
 /**
  * Base class of operators.
  */
-template <typename Number>
+template <typename Number, typename MemorySpace>
 class OperatorBase
 {
 public:
   /**
    * deal.II vector type
    */
-  using VectorType = LinearAlgebra::distributed::Vector<Number>;
+  using VectorType = LinearAlgebra::distributed::Vector<Number, MemorySpace>;
 
   /**
    * Initialize vector.
@@ -128,781 +127,4 @@ class OperatorBase
   compute_inverse_diagonal(VectorType &diagonal) const = 0;
 };
 
-
-/**
- * Operator implementation using libCEED.
- */
-template <int dim, typename Number>
-class OperatorCeed : public OperatorBase<Number>
-{
-public:
-  using VectorType = typename OperatorBase<Number>::VectorType;
-
-  /**
-   * Constructor.
-   */
-  OperatorCeed(const Mapping<dim>              &mapping,
-               const DoFHandler<dim>           &dof_handler,
-               const AffineConstraints<Number> &constraints,
-               const Quadrature<dim>           &quadrature,
-               const BPType                    &bp,
-               const std::string               &resource)
-    : mapping(mapping)
-    , dof_handler(dof_handler)
-    , constraints(constraints)
-    , quadrature(quadrature)
-    , bp(bp)
-    , resource(resource)
-  {
-    reinit();
-  }
-
-  /**
-   * Destructor.
-   */
-  ~OperatorCeed()
-  {
-    CeedVectorDestroy(&src_ceed);
-    CeedVectorDestroy(&dst_ceed);
-    CeedOperatorDestroy(&op_apply);
-    CeedDestroy(&ceed);
-  }
-
-  /**
-   * Initialized internal data structures, particularly, libCEED.
-   */
-  void
-  reinit() override
-  {
-    CeedVector           metric_data;
-    CeedBasis            sol_basis;
-    CeedElemRestriction  sol_restriction;
-    CeedElemRestriction  metric_data_restriction;
-    BuildContext         build_ctx_data;
-    CeedQFunctionContext build_ctx;
-    CeedQFunction        qf_apply;
-
-    const auto &tria = dof_handler.get_triangulation();
-    const auto &fe   = dof_handler.get_fe();
-
-    const auto n_components = fe.n_components();
-
-    if (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5)
-      {
-        AssertThrow(n_components == 1, ExcInternalError());
-      }
-    else
-      {
-        AssertThrow(n_components == dim, ExcInternalError());
-      }
-
-    // 1) create CEED instance -> "MatrixFree"
-    const char *ceed_spec = resource.c_str();
-    CeedInit(ceed_spec, &ceed);
-
-    // 2) create shape functions -> "ShapeInfo"
-    const unsigned int fe_degree  = fe.tensor_degree();
-    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
-    {
-      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature, fe, 0);
-      const auto             &shape_data = shape_info.get_shape_data();
-      std::vector<CeedScalar> q_ref_1d;
-      for (const auto q : shape_data.quadrature.get_points())
-        q_ref_1d.push_back(q(0));
-
-      // transpose bases for compatibility with restriction
-      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
-      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
-      for (unsigned int i = 0; i < n_q_points; ++i)
-        for (unsigned int j = 0; j < fe_degree + 1; ++j)
-          {
-            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
-            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
-          }
-
-      CeedBasisCreateTensorH1(ceed,
-                              dim,
-                              n_components,
-                              fe_degree + 1,
-                              n_q_points,
-                              interp_1d.data(),
-                              grad_1d.data(),
-                              q_ref_1d.data(),
-                              quadrature.get_tensor_basis()[0].get_weights().data(),
-                              &sol_basis);
-    }
-
-    // 3) create restriction matrix -> DoFInfo
-    unsigned int n_local_active_cells = 0;
-
-    for (const auto &cell : dof_handler.active_cell_iterators())
-      if (cell->is_locally_owned())
-        n_local_active_cells++;
-
-    partitioner =
-      std::make_shared<Utilities::MPI::Partitioner>(dof_handler.locally_owned_dofs(),
-                                                    DoFTools::extract_locally_active_dofs(
-                                                      dof_handler),
-                                                    dof_handler.get_communicator());
-
-    std::vector<CeedInt> indices;
-    indices.reserve(n_local_active_cells * fe.n_dofs_per_cell() / n_components);
-
-    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
-
-    std::vector<types::global_dof_index> local_indices(fe.n_dofs_per_cell());
-
-    for (const auto &cell : dof_handler.active_cell_iterators())
-      if (cell->is_locally_owned())
-        {
-          cell->get_dof_indices(local_indices);
-
-          for (const auto i : dof_mapping)
-            indices.emplace_back(
-              partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)]));
-        }
-
-    CeedElemRestrictionCreate(ceed,
-                              n_local_active_cells,
-                              fe.n_dofs_per_cell() / n_components,
-                              n_components,
-                              1,
-                              this->extended_local_size(),
-                              CEED_MEM_HOST,
-                              CEED_COPY_VALUES,
-                              indices.data(),
-                              &sol_restriction);
-
-    // 4) create mapping -> MappingInfo
-    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
-
-    metric_data_raw = compute_metric_data(ceed, mapping, tria, quadrature, bp);
-
-    strides = {{1,
-                static_cast<int>(quadrature.size()),
-                static_cast<int>(quadrature.size() * n_components_metric)}};
-    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
-    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
-    CeedElemRestrictionCreateStrided(ceed,
-                                     n_local_active_cells,
-                                     quadrature.size(),
-                                     n_components_metric,
-                                     metric_data_raw.size(),
-                                     strides.data(),
-                                     &metric_data_restriction);
-
-    build_ctx_data.dim       = dim;
-    build_ctx_data.space_dim = dim;
-
-    CeedQFunctionContextCreate(ceed, &build_ctx);
-    CeedQFunctionContextSetData(
-      build_ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(build_ctx_data), &build_ctx_data);
-
-    // 5) create q operation
-    if (bp == BPType::BP1)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &qf_apply);
-    else if (bp == BPType::BP2)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass_vec, f_apply_mass_vec_loc, &qf_apply);
-    else if (bp == BPType::BP3 || bp == BPType::BP5)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson, f_apply_poisson_loc, &qf_apply);
-    else if (bp == BPType::BP4 || bp == BPType::BP6)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson_vec, f_apply_poisson_vec_loc, &qf_apply);
-    else
-      AssertThrow(false, ExcInternalError());
-
-    if (bp <= BPType::BP2)
-      CeedQFunctionAddInput(qf_apply, "u", n_components, CEED_EVAL_INTERP);
-    else
-      CeedQFunctionAddInput(qf_apply, "u", dim * n_components, CEED_EVAL_GRAD);
-
-    CeedQFunctionAddInput(qf_apply, "metric data", n_components_metric, CEED_EVAL_NONE);
-
-    if (bp <= BPType::BP2)
-      CeedQFunctionAddOutput(qf_apply, "v", n_components, CEED_EVAL_INTERP);
-    else
-      CeedQFunctionAddOutput(qf_apply, "v", dim * n_components, CEED_EVAL_GRAD);
-
-    CeedQFunctionSetContext(qf_apply, build_ctx);
-
-    // 6) put everything together
-    CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
-
-    CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
-    CeedOperatorSetField(
-      op_apply, "metric data", metric_data_restriction, CEED_BASIS_NONE, metric_data);
-    CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
-
-    // 7) libCEED vectors
-    CeedElemRestrictionCreateVector(sol_restriction, &src_ceed, NULL);
-    CeedElemRestrictionCreateVector(sol_restriction, &dst_ceed, NULL);
-
-    // 8) cleanup
-    CeedVectorDestroy(&metric_data);
-    CeedElemRestrictionDestroy(&metric_data_restriction);
-    CeedElemRestrictionDestroy(&sol_restriction);
-    CeedBasisDestroy(&sol_basis);
-    CeedQFunctionContextDestroy(&build_ctx);
-    CeedQFunctionDestroy(&qf_apply);
-  }
-
-  /**
-   * Perform matrix-vector product.
-   */
-  void
-  vmult(VectorType &dst, const VectorType &src) const override
-  {
-    // communicate: update ghost values
-    src.update_ghost_values();
-
-    // pass memory buffers to libCEED
-    VectorTypeCeed x(src_ceed);
-    VectorTypeCeed y(dst_ceed);
-    x.import_array(src, CEED_MEM_HOST);
-    y.import_array(dst, CEED_MEM_HOST);
-
-    // apply operator
-    CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
-
-    // pull arrays back to deal.II
-    x.take_array();
-    y.take_array();
-
-    // communicate: compress
-    src.zero_out_ghost_values();
-    dst.compress(VectorOperation::add);
-
-    // apply constraints: we assume homogeneous DBC
-    constraints.set_zero(dst);
-  }
-
-  /**
-   * Initialized vector.
-   */
-  void
-  initialize_dof_vector(VectorType &vec) const override
-  {
-    vec.reinit(partitioner);
-  }
-
-  /**
-   * Compute inverse of diagonal.
-   */
-  void
-  compute_inverse_diagonal(VectorType &diagonal) const override
-  {
-    this->initialize_dof_vector(diagonal);
-
-    // pass memory buffer to libCEED
-    VectorTypeCeed y(dst_ceed);
-    y.import_array(diagonal, CEED_MEM_HOST);
-
-    CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
-
-    // pull array back to deal.II
-    y.take_array();
-
-    diagonal.compress(VectorOperation::add);
-
-    // apply constraints: we assume homogeneous DBC
-    constraints.set_zero(diagonal);
-
-    for (auto &i : diagonal)
-      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
-  }
-
-private:
-  /**
-   * Wrapper around a deal.II vector to create a libCEED vector view.
-   */
-  class VectorTypeCeed
-  {
-  public:
-    /**
-     * Constructor.
-     */
-    VectorTypeCeed(const CeedVector &vec_orig)
-    {
-      vec_ceed = NULL;
-      CeedVectorReferenceCopy(vec_orig, &vec_ceed);
-    }
-
-    /**
-     * Return libCEED vector view.
-     */
-    CeedVector &
-    operator()()
-    {
-      return vec_ceed;
-    }
-
-    /**
-     * Set deal.II memory in libCEED vector.
-     */
-    void
-    import_array(const VectorType &vec, const CeedMemType space)
-    {
-      mem_space = space;
-      CeedVectorSetArray(vec_ceed, mem_space, CEED_USE_POINTER, vec.get_values());
-    }
-
-    /**
-     * Sync memory from device to host.
-     */
-    void
-    sync_array()
-    {
-      CeedVectorSyncArray(vec_ceed, mem_space);
-    }
-
-    /**
-     * Take previously set deal.II array from libCEED vector
-     */
-    void
-    take_array()
-    {
-      CeedScalar *ptr;
-      CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
-    }
-
-    /**
-     * Destructor: destroy vector view.
-     */
-    ~VectorTypeCeed()
-    {
-      bool has_array;
-      CeedVectorHasBorrowedArrayOfType(vec_ceed, mem_space, &has_array);
-      if (has_array)
-        {
-          CeedScalar *ptr;
-          CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
-        }
-      CeedVectorDestroy(&vec_ceed);
-    }
-
-  private:
-    /**
-     * libCEED vector view.
-     */
-    CeedMemType mem_space;
-    CeedVector  vec_ceed;
-  };
-
-  /**
-   * Number of locally active DoFs.
-   */
-  unsigned int
-  extended_local_size() const
-  {
-    return partitioner->locally_owned_size() + partitioner->n_ghost_indices();
-  }
-
-  /**
-   * Compute metric data: Jacobian, ...
-   */
-  static std::vector<double>
-  compute_metric_data(const Ceed               &ceed,
-                      const Mapping<dim>       &mapping,
-                      const Triangulation<dim> &tria,
-                      const Quadrature<dim>    &quadrature,
-                      const BPType              bp)
-  {
-    std::vector<double> metric_data_raw;
-
-    CeedBasis            geo_basis;
-    CeedVector           metric_data;
-    CeedElemRestriction  metric_data_restriction;
-    CeedVector           node_coords;
-    CeedElemRestriction  geo_restriction;
-    CeedQFunctionContext build_ctx;
-    CeedQFunction        qf_build;
-    CeedOperator         op_build;
-
-    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
-
-    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
-
-    const auto mapping_q = dynamic_cast<const MappingQ<dim> *>(&mapping);
-
-    AssertThrow(mapping_q, ExcMessage("Wrong mapping!"));
-
-    const unsigned int fe_degree = mapping_q->get_degree();
-
-    FE_Q<dim> geo_fe(fe_degree);
-
-    {
-      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature,
-                                                                                geo_fe,
-                                                                                0);
-      const auto             &shape_data = shape_info.get_shape_data();
-      std::vector<CeedScalar> q_ref_1d;
-      for (const auto q : shape_data.quadrature.get_points())
-        q_ref_1d.push_back(q(0));
-
-      // transpose bases for compatibility with restriction
-      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
-      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
-      for (unsigned int i = 0; i < n_q_points; ++i)
-        for (unsigned int j = 0; j < fe_degree + 1; ++j)
-          {
-            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
-            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
-          }
-
-      CeedBasisCreateTensorH1(ceed,
-                              dim,
-                              dim,
-                              fe_degree + 1,
-                              n_q_points,
-                              interp_1d.data(),
-                              grad_1d.data(),
-                              q_ref_1d.data(),
-                              quadrature.get_tensor_basis()[0].get_weights().data(),
-                              &geo_basis);
-    }
-
-    unsigned int n_local_active_cells = 0;
-
-    for (const auto &cell : tria.active_cell_iterators())
-      if (cell->is_locally_owned())
-        n_local_active_cells++;
-
-    std::vector<double>  geo_support_points;
-    std::vector<CeedInt> geo_indices;
-
-    DoFHandler<dim> geo_dof_handler(tria);
-    geo_dof_handler.distribute_dofs(geo_fe);
-
-    const auto geo_partitioner =
-      std::make_shared<Utilities::MPI::Partitioner>(geo_dof_handler.locally_owned_dofs(),
-                                                    DoFTools::extract_locally_active_dofs(
-                                                      geo_dof_handler),
-                                                    geo_dof_handler.get_communicator());
-
-    geo_indices.reserve(n_local_active_cells * geo_fe.n_dofs_per_cell());
-
-    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
-
-    FEValues<dim> fe_values(mapping,
-                            geo_fe,
-                            geo_fe.get_unit_support_points(),
-                            update_quadrature_points);
-
-    std::vector<types::global_dof_index> local_indices(geo_fe.n_dofs_per_cell());
-
-    const unsigned int n_points =
-      geo_partitioner->locally_owned_size() + geo_partitioner->n_ghost_indices();
-
-    geo_support_points.resize(dim * n_points);
-
-    for (const auto &cell : geo_dof_handler.active_cell_iterators())
-      if (cell->is_locally_owned())
-        {
-          fe_values.reinit(cell);
-          cell->get_dof_indices(local_indices);
-
-          for (const auto i : dof_mapping)
-            {
-              const auto index = geo_partitioner->global_to_local(local_indices[i]);
-              geo_indices.emplace_back(index * dim);
-
-              const auto point = fe_values.quadrature_point(i);
-
-              for (unsigned int d = 0; d < dim; ++d)
-                geo_support_points[index * dim + d] = point[d];
-            }
-        }
-
-    metric_data_raw.resize(n_local_active_cells * quadrature.size() * n_components_metric);
-
-    CeedInt strides[3] = {1,
-                          static_cast<int>(quadrature.size()),
-                          static_cast<int>(quadrature.size() * n_components_metric)};
-
-    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
-    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
-    CeedElemRestrictionCreateStrided(ceed,
-                                     n_local_active_cells,
-                                     quadrature.size(),
-                                     n_components_metric,
-                                     metric_data_raw.size(),
-                                     strides,
-                                     &metric_data_restriction);
-
-    CeedVectorCreate(ceed, geo_support_points.size(), &node_coords);
-    CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, geo_support_points.data());
-
-    CeedElemRestrictionCreate(ceed,
-                              n_local_active_cells,
-                              geo_fe.n_dofs_per_cell(),
-                              dim,
-                              1,
-                              geo_support_points.size(),
-                              CEED_MEM_HOST,
-                              CEED_COPY_VALUES,
-                              geo_indices.data(),
-                              &geo_restriction);
-
-    BuildContext build_ctx_data;
-    build_ctx_data.dim       = dim;
-    build_ctx_data.space_dim = dim;
-
-    CeedQFunctionContextCreate(ceed, &build_ctx);
-    CeedQFunctionContextSetData(
-      build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
-
-    // 5) create q operation
-    if (bp <= BPType::BP2)
-      CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &qf_build);
-    else
-      CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build);
-
-    CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD);
-    CeedQFunctionAddInput(qf_build, "weight", 1, CEED_EVAL_WEIGHT);
-    CeedQFunctionAddOutput(qf_build, "metric data", n_components_metric, CEED_EVAL_NONE);
-    CeedQFunctionSetContext(qf_build, build_ctx);
-
-    // 6) put everything together
-    CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
-    CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE);
-    CeedOperatorSetField(
-      op_build, "weight", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
-    CeedOperatorSetField(
-      op_build, "metric data", metric_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
-
-    CeedOperatorApply(op_build, node_coords, metric_data, CEED_REQUEST_IMMEDIATE);
-
-    CeedVectorDestroy(&node_coords);
-    CeedVectorSyncArray(metric_data, CEED_MEM_HOST);
-    CeedVectorDestroy(&metric_data);
-    CeedElemRestrictionDestroy(&geo_restriction);
-    CeedElemRestrictionDestroy(&metric_data_restriction);
-    CeedBasisDestroy(&geo_basis);
-    CeedQFunctionContextDestroy(&build_ctx);
-    CeedQFunctionDestroy(&qf_build);
-    CeedOperatorDestroy(&op_build);
-
-    return metric_data_raw;
-  }
-
-  /**
-   * Mapping object passed to the constructor.
-   */
-  const Mapping<dim> &mapping;
-
-  /**
-   * DoFHandler object passed to the constructor.
-   */
-  const DoFHandler<dim> &dof_handler;
-
-  /**
-   * Constraints object passed to the constructor.
-   */
-  const AffineConstraints<Number> &constraints;
-
-  /**
-   * Quadrature rule object passed to the constructor.
-   */
-  const Quadrature<dim> &quadrature;
-
-  /**
-   * Selected BP.
-   */
-  const BPType bp;
-
-  /**
-   * Resource name.
-   */
-  const std::string resource;
-
-  /**
-   * Partitioner for distributed vectors.
-   */
-  std::shared_ptr<Utilities::MPI::Partitioner> partitioner;
-
-  /**
-   * libCEED data structures.
-   */
-  Ceed                   ceed;
-  std::vector<double>    metric_data_raw;
-  std::array<CeedInt, 3> strides;
-  CeedVector             src_ceed;
-  CeedVector             dst_ceed;
-  CeedOperator           op_apply;
-};
-
-
-
-template <int dim, typename Number>
-class OperatorDealii : public OperatorBase<Number>
-{
-public:
-  using VectorType = typename OperatorBase<Number>::VectorType;
-
-  /**
-   * Constructor.
-   */
-  OperatorDealii(const Mapping<dim>              &mapping,
-                 const DoFHandler<dim>           &dof_handler,
-                 const AffineConstraints<Number> &constraints,
-                 const Quadrature<dim>           &quadrature,
-                 const BPType                    &bp)
-    : mapping(mapping)
-    , dof_handler(dof_handler)
-    , constraints(constraints)
-    , quadrature(quadrature)
-    , bp(bp)
-  {
-    reinit();
-  }
-
-  /**
-   * Destructor.
-   */
-  ~OperatorDealii() = default;
-
-  /**
-   * Initialized internal data structures, particularly, MatrixFree.
-   */
-  void
-  reinit() override
-  {
-    // configure MatrixFree
-    typename MatrixFree<dim, Number>::AdditionalData additional_data;
-    additional_data.tasks_parallel_scheme =
-      MatrixFree<dim, Number>::AdditionalData::TasksParallelScheme::none;
-
-    // create MatrixFree
-    matrix_free.reinit(mapping, dof_handler, constraints, quadrature, additional_data);
-  }
-
-  /**
-   * Matrix-vector product.
-   */
-  void
-  vmult(VectorType &dst, const VectorType &src) const override
-  {
-    if (dof_handler.get_fe().n_components() == 1)
-      {
-        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<1>, this, dst, src, true);
-      }
-    else
-      {
-        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
-
-        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<dim>, this, dst, src, true);
-      }
-  }
-
-  /**
-   * Initialize vector.
-   */
-  void
-  initialize_dof_vector(VectorType &vec) const override
-  {
-    matrix_free.initialize_dof_vector(vec);
-  }
-
-  /**
-   * Compute inverse of diagonal.
-   */
-  void
-  compute_inverse_diagonal(VectorType &diagonal) const override
-  {
-    this->initialize_dof_vector(diagonal);
-
-    if (dof_handler.get_fe().n_components() == 1)
-      {
-        MatrixFreeTools::compute_diagonal(matrix_free,
-                                          diagonal,
-                                          &OperatorDealii::do_cell_integral_local<1>,
-                                          this);
-      }
-    else
-      {
-        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
-
-        MatrixFreeTools::compute_diagonal(matrix_free,
-                                          diagonal,
-                                          &OperatorDealii::do_cell_integral_local<dim>,
-                                          this);
-      }
-
-    for (auto &i : diagonal)
-      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
-  }
-
-private:
-  /**
-   * Cell integral without vector access.
-   */
-  template <int n_components>
-  void
-  do_cell_integral_local(FEEvaluation<dim, -1, 0, n_components, Number> &phi) const
-  {
-    if (bp <= BPType::BP2) // mass matrix
-      {
-        phi.evaluate(EvaluationFlags::values);
-        for (const auto q : phi.quadrature_point_indices())
-          phi.submit_value(phi.get_value(q), q);
-        phi.integrate(EvaluationFlags::values);
-      }
-    else // Poisson operator
-      {
-        phi.evaluate(EvaluationFlags::gradients);
-        for (const auto q : phi.quadrature_point_indices())
-          phi.submit_gradient(phi.get_gradient(q), q);
-        phi.integrate(EvaluationFlags::gradients);
-      }
-  }
-
-  /**
-   * Cell integral on a range of cells.
-   */
-  template <int n_components>
-  void
-  do_cell_integral_range(const MatrixFree<dim, Number>               &matrix_free,
-                         VectorType                                  &dst,
-                         const VectorType                            &src,
-                         const std::pair<unsigned int, unsigned int> &range) const
-  {
-    FEEvaluation<dim, -1, 0, n_components, Number> phi(matrix_free, range);
-
-    for (unsigned cell = range.first; cell < range.second; ++cell)
-      {
-        phi.reinit(cell);
-        phi.read_dof_values(src);            // read source vector
-        do_cell_integral_local(phi);         // cell integral
-        phi.distribute_local_to_global(dst); // write to destination vector
-      }
-  }
-
-  /**
-   * Mapping object passed to the constructor.
-   */
-  const Mapping<dim> &mapping;
-
-  /**
-   * DoFHandler object passed to the constructor.
-   */
-  const DoFHandler<dim> &dof_handler;
-
-  /**
-   * Constraints object passed to the constructor.
-   */
-  const AffineConstraints<Number> &constraints;
-
-  /**
-   * Quadrature rule object passed to the constructor.
-   */
-  const Quadrature<dim> &quadrature;
-
-  /**
-   * Selected BP.
-   */
-  const BPType bp;
-
-  /**
-   * MatrixFree object.
-   */
-  MatrixFree<dim, Number> matrix_free;
-};
+#endif

From eb5e7f265ad4bea76490efa023163fbf9f3d6c69 Mon Sep 17 00:00:00 2001
From: Hugh Carson <hughcars@amazon.com>
Date: Wed, 4 Feb 2026 12:44:51 -0500
Subject: [PATCH 537/571] ci: fix macOS tests to use explicit GCC and LLVM
 compilers

The "gcc" on macOS is a symlink to Apple Clang, so the existing matrix
was testing Apple Clang twice instead of testing distinct compilers.

Fix by testing three explicit compilers on macOS:
- clang: Apple Clang (system)
- gcc-15: real GCC from Homebrew
- llvm: LLVM 18 from Homebrew

Print compiler version to make the distinction visible in CI output.
---
 .../workflows/c-fortran-test-linux-osx.yml    | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index e732fcc9a1..f0ca154939 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -12,15 +12,33 @@ jobs:
       matrix:
         os: [ubuntu-24.04, macos-15]
         compiler: [gcc, clang]
+        exclude:
+          # "gcc" on macOS is a symlink to Apple Clang, same as "clang"
+          - os: macos-15
+            compiler: gcc
+        include:
+          # macOS: test with real GCC (not Apple Clang symlink)
+          - os: macos-15
+            compiler: gcc-15
+          # macOS: test with Homebrew LLVM
+          - os: macos-15
+            compiler: llvm
 
     runs-on: ${{ matrix.os }}
 
     steps:
     - name: Environment setup
       uses: actions/checkout@v4
+    - name: Set LLVM compiler path
+      if: matrix.compiler == 'llvm'
+      run: echo "CC=$(brew --prefix llvm@18)/bin/clang" >> $GITHUB_ENV
+    - name: Set compiler
+      if: matrix.compiler != 'llvm'
+      run: echo "CC=${{ matrix.compiler }}" >> $GITHUB_ENV
+    - name: Show compiler version
+      run: $CC --version | head -1
     - name: Build and test libCEED
       env:
-        CC: ${{ matrix.compiler }}
         FC: gfortran-14
       run: |
         make info

From 7247fd92a8a86a7e36cba392bcb1b62954e39b1a Mon Sep 17 00:00:00 2001
From: Hugh Carson <hughcars@amazon.com>
Date: Wed, 4 Feb 2026 14:36:14 -0500
Subject: [PATCH 538/571] make: use -mcpu=native for GCC and Clang on non-x86
 architectures

On ARM, -march=native resolves to CPU names (e.g., apple-m1) which are
invalid for -march; must use -mcpu=native instead. Use uname -m to detect
architecture and select the appropriate flag. Clang has the same limitation,
so MARCHFLAG.clang follows MARCHFLAG.gcc.
---
 Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index ebefbe877d..742c8b2a4b 100644
--- a/Makefile
+++ b/Makefile
@@ -112,8 +112,12 @@ CC_VENDOR := $(subst (GCC),gcc,$(subst icc_orig,icc,$(CC_VENDOR)))
 CC_VENDOR := $(if $(filter cc,$(CC_VENDOR)),gcc,$(CC_VENDOR))
 FC_VENDOR := $(if $(FC),$(firstword $(filter GNU ifort ifx XL,$(shell $(FC) --version 2>&1 || $(FC) -qversion))))
 
+# Host architecture for setting appropriate flags
+UNAME_M := $(shell uname -m)
+
 # Default extra flags by vendor
-MARCHFLAG.gcc           := -march=native
+# GCC: use -march=native only on x86 (where -mcpu doesn't exist); use -mcpu=native elsewhere
+MARCHFLAG.gcc           := $(if $(filter x86_64 i%86,$(UNAME_M)),-march=native,-mcpu=native)
 MARCHFLAG.clang         := $(MARCHFLAG.gcc)
 MARCHFLAG.icc           :=
 MARCHFLAG.oneAPI        := $(MARCHFLAG.clang)

From 94ab23f82a7dd32392e4d004d5d63092f89e6af7 Mon Sep 17 00:00:00 2001
From: Hugh Carson <hughcars@amazon.com>
Date: Thu, 5 Feb 2026 18:38:25 -0500
Subject: [PATCH 539/571] ci: preserve CI job names, add apple-clang test

Map compiler display names to actual binaries in a step instead of
using matrix include/exclude, so job names stay as gcc/clang. Adds
a separate apple-clang job on macOS to test the system compiler.
---
 .../workflows/c-fortran-test-linux-osx.yml    | 35 +++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index f0ca154939..52df23c8d1 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -12,29 +12,36 @@ jobs:
       matrix:
         os: [ubuntu-24.04, macos-15]
         compiler: [gcc, clang]
-        exclude:
-          # "gcc" on macOS is a symlink to Apple Clang, same as "clang"
-          - os: macos-15
-            compiler: gcc
         include:
-          # macOS: test with real GCC (not Apple Clang symlink)
-          - os: macos-15
-            compiler: gcc-15
-          # macOS: test with Homebrew LLVM
           - os: macos-15
-            compiler: llvm
+            compiler: apple-clang
 
     runs-on: ${{ matrix.os }}
 
     steps:
     - name: Environment setup
       uses: actions/checkout@v4
-    - name: Set LLVM compiler path
-      if: matrix.compiler == 'llvm'
-      run: echo "CC=$(brew --prefix llvm@18)/bin/clang" >> $GITHUB_ENV
     - name: Set compiler
-      if: matrix.compiler != 'llvm'
-      run: echo "CC=${{ matrix.compiler }}" >> $GITHUB_ENV
+      run: |
+        case "${{ matrix.compiler }}" in
+          gcc)
+            if [[ "${{ matrix.os }}" == macos-* ]]; then
+              echo "CC=gcc-15" >> $GITHUB_ENV
+            else
+              echo "CC=gcc" >> $GITHUB_ENV
+            fi
+            ;;
+          clang)
+            if [[ "${{ matrix.os }}" == macos-* ]]; then
+              echo "CC=$(brew --prefix llvm@18)/bin/clang" >> $GITHUB_ENV
+            else
+              echo "CC=clang" >> $GITHUB_ENV
+            fi
+            ;;
+          apple-clang)
+            echo "CC=clang" >> $GITHUB_ENV
+            ;;
+        esac
     - name: Show compiler version
       run: $CC --version | head -1
     - name: Build and test libCEED

From 7b6fac6b22b60e6240f91980eb9d87fbfb688f61 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 6 Feb 2026 13:02:42 -0700
Subject: [PATCH 540/571] minor - typo

---
 interface/ceed-vector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 1a42381897..eb9f4fc85b 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -196,7 +196,7 @@ int CeedVectorReference(CeedVector vec) {
   @ref User
 **/
 int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) {
-  CeedCheck(length >= 0, ceed, CEED_ERROR_UNSUPPORTED, "CeedVector must have length >= 0, recieved %" CeedSize_FMT, length);
+  CeedCheck(length >= 0, ceed, CEED_ERROR_UNSUPPORTED, "CeedVector must have length >= 0, received %" CeedSize_FMT, length);
   if (!ceed->VectorCreate) {
     Ceed delegate;
 

From 0b6847a6bd8ae6afc56b1dc81e69df6d744052aa Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Feb 2026 08:42:10 -0700
Subject: [PATCH 541/571] gallery - add identity to scalar qf

---
 gallery/ceed-gallery-list.h                   |  1 +
 gallery/identity/ceed-identity-to-scalar.c    | 33 +++++++++++++++++++
 .../gallery/ceed-identity-to-scalar.h         | 22 +++++++++++++
 3 files changed, 56 insertions(+)
 create mode 100644 gallery/identity/ceed-identity-to-scalar.c
 create mode 100644 include/ceed/jit-source/gallery/ceed-identity-to-scalar.h

diff --git a/gallery/ceed-gallery-list.h b/gallery/ceed-gallery-list.h
index d45aea46cc..030a3b2c28 100644
--- a/gallery/ceed-gallery-list.h
+++ b/gallery/ceed-gallery-list.h
@@ -13,6 +13,7 @@
 // At the time of this writing, all the gallery functions are defined, but we're adopting the same strategy here as for the backends because future gallery @ref CeedQFunction might depend on external libraries.
 
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Identity)
+CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_IdentityScalar)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass1DBuild)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass2DBuild)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass3DBuild)
diff --git a/gallery/identity/ceed-identity-to-scalar.c b/gallery/identity/ceed-identity-to-scalar.c
new file mode 100644
index 0000000000..ddb04574e1
--- /dev/null
+++ b/gallery/identity/ceed-identity-to-scalar.c
@@ -0,0 +1,33 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <ceed/jit-source/gallery/ceed-identity-to-scalar.h>
+#include <stddef.h>
+#include <string.h>
+
+/**
+  @brief Set fields identity `CeedQFunction` that copies first input component directly into output
+**/
+static int CeedQFunctionInit_IdentityScalar(Ceed ceed, const char *requested, CeedQFunction qf) {
+  // Check QFunction name
+  const char *name = "Identity to scalar";
+  CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
+
+  // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
+
+  CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Register identity `CeedQFunction` that copies first input component directly into output
+**/
+CEED_INTERN int CeedQFunctionRegister_IdentityScalar(void) {
+  return CeedQFunctionRegister("Identity to scalar", IdentityScalar_loc, 1, IdentityScalar, CeedQFunctionInit_IdentityScalar);
+}
diff --git a/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h b/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h
new file mode 100644
index 0000000000..5cf406fe51
--- /dev/null
+++ b/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/**
+  @brief  Identity QFunction that copies first input component directly into output
+**/
+#include <ceed/types.h>
+
+CEED_QFUNCTION(IdentityScalar)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is input, size (Q*size)
+  const CeedScalar *input = in[0];
+  // out[0] is output, size (Q)
+  CeedScalar *output = out[0];
+
+  // Quadrature point loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { output[i] = input[i]; }  // End of Quadrature Point Loop
+  return CEED_ERROR_SUCCESS;
+}

From becbf5cd1f8a17165b4b35b6aeb0ea66cbe89550 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Feb 2026 08:45:04 -0700
Subject: [PATCH 542/571] gallery - add scalar scaling qf

---
 gallery/ceed-gallery-list.h                   |  1 +
 gallery/scale/ceed-scale-scalar.c             | 30 +++++++++++++++++++
 .../jit-source/gallery/ceed-scale-scalar.h    | 29 ++++++++++++++++++
 3 files changed, 60 insertions(+)
 create mode 100644 gallery/scale/ceed-scale-scalar.c
 create mode 100644 include/ceed/jit-source/gallery/ceed-scale-scalar.h

diff --git a/gallery/ceed-gallery-list.h b/gallery/ceed-gallery-list.h
index 030a3b2c28..c1829eef64 100644
--- a/gallery/ceed-gallery-list.h
+++ b/gallery/ceed-gallery-list.h
@@ -29,3 +29,4 @@ CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson1DApply)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson2DApply)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson3DApply)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Scale)
+CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_ScaleScalar)
diff --git a/gallery/scale/ceed-scale-scalar.c b/gallery/scale/ceed-scale-scalar.c
new file mode 100644
index 0000000000..c98554c2e6
--- /dev/null
+++ b/gallery/scale/ceed-scale-scalar.c
@@ -0,0 +1,30 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <ceed/jit-source/gallery/ceed-scale-scalar.h>
+#include <string.h>
+
+/**
+  @brief  Set fields for vector scaling `CeedQFunction` that scales inputs
+**/
+static int CeedQFunctionInit_ScaleScalar(Ceed ceed, const char *requested, CeedQFunction qf) {
+  // Check QFunction name
+  const char *name = "Scale (scalar)";
+  CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
+
+  // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Register scaling `CeedQFunction`
+**/
+CEED_INTERN int CeedQFunctionRegister_ScaleScalar(void) {
+  return CeedQFunctionRegister("Scale (scalar)", ScaleScalar_loc, 1, ScaleScalar, CeedQFunctionInit_ScaleScalar);
+}
diff --git a/include/ceed/jit-source/gallery/ceed-scale-scalar.h b/include/ceed/jit-source/gallery/ceed-scale-scalar.h
new file mode 100644
index 0000000000..bd1c210084
--- /dev/null
+++ b/include/ceed/jit-source/gallery/ceed-scale-scalar.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/**
+  @brief  Scaling QFunction that scales inputs
+**/
+#include <ceed/types.h>
+
+CEED_QFUNCTION(ScaleScalar)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // Ctx holds field size
+  const CeedInt size = *(CeedInt *)ctx;
+
+  // in[0] is input, size (Q*size)
+  // in[1] is scaling factor, size (Q*size)
+  const CeedScalar *input = in[0];
+  const CeedScalar *scale = in[1];
+  // out[0] is output, size (Q*size)
+  CeedScalar *output = out[0];
+
+  // Quadrature point loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    for (CeedInt j = 0; j < size; j++) output[i + j * Q] = input[i + j * Q] * scale[i];
+  }  // End of Quadrature Point Loop
+  return 0;
+}

From 6261a418bdb3efd56d67795785948f0f7b436c3e Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Feb 2026 08:52:00 -0700
Subject: [PATCH 543/571] minor - style consistency

---
 gallery/identity/ceed-identity-to-scalar.c                | 1 +
 gallery/identity/ceed-identity.c                          | 5 +++--
 gallery/mass-vector/ceed-vectormassapply.c                | 3 ++-
 gallery/mass/ceed-mass1dbuild.c                           | 3 ++-
 gallery/mass/ceed-mass2dbuild.c                           | 3 ++-
 gallery/mass/ceed-mass3dbuild.c                           | 3 ++-
 gallery/mass/ceed-massapply.c                             | 2 +-
 gallery/poisson-vector/ceed-vectorpoisson1dapply.c        | 3 ++-
 gallery/poisson-vector/ceed-vectorpoisson2dapply.c        | 3 ++-
 gallery/poisson-vector/ceed-vectorpoisson3dapply.c        | 3 ++-
 gallery/poisson/ceed-poisson1dapply.c                     | 3 ++-
 gallery/poisson/ceed-poisson1dbuild.c                     | 3 ++-
 gallery/poisson/ceed-poisson2dapply.c                     | 3 ++-
 gallery/poisson/ceed-poisson2dbuild.c                     | 3 ++-
 gallery/poisson/ceed-poisson3dapply.c                     | 3 ++-
 gallery/poisson/ceed-poisson3dbuild.c                     | 3 ++-
 gallery/scale/ceed-scale-scalar.c                         | 1 +
 gallery/scale/ceed-scale.c                                | 2 +-
 include/ceed/jit-source/gallery/ceed-identity.h           | 1 -
 include/ceed/jit-source/gallery/ceed-mass1dbuild.h        | 1 -
 include/ceed/jit-source/gallery/ceed-mass2dbuild.h        | 1 -
 include/ceed/jit-source/gallery/ceed-mass3dbuild.h        | 1 -
 include/ceed/jit-source/gallery/ceed-massapply.h          | 1 -
 include/ceed/jit-source/gallery/ceed-poisson1dapply.h     | 1 -
 include/ceed/jit-source/gallery/ceed-poisson1dbuild.h     | 1 -
 include/ceed/jit-source/gallery/ceed-poisson2dapply.h     | 1 -
 include/ceed/jit-source/gallery/ceed-poisson2dbuild.h     | 8 ++++----
 include/ceed/jit-source/gallery/ceed-poisson3dapply.h     | 1 -
 include/ceed/jit-source/gallery/ceed-poisson3dbuild.h     | 1 -
 include/ceed/jit-source/gallery/ceed-scale-scalar.h       | 2 +-
 include/ceed/jit-source/gallery/ceed-scale.h              | 2 +-
 include/ceed/jit-source/gallery/ceed-vectormassapply.h    | 1 -
 .../ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h   | 1 -
 .../ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h   | 1 -
 .../ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h   | 1 -
 35 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/gallery/identity/ceed-identity-to-scalar.c b/gallery/identity/ceed-identity-to-scalar.c
index ddb04574e1..403fcbafe5 100644
--- a/gallery/identity/ceed-identity-to-scalar.c
+++ b/gallery/identity/ceed-identity-to-scalar.c
@@ -17,6 +17,7 @@
 static int CeedQFunctionInit_IdentityScalar(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Identity to scalar";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
diff --git a/gallery/identity/ceed-identity.c b/gallery/identity/ceed-identity.c
index 1391986b58..415d19a274 100644
--- a/gallery/identity/ceed-identity.c
+++ b/gallery/identity/ceed-identity.c
@@ -17,21 +17,22 @@
 static int CeedQFunctionInit_Identity(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Identity";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
 
-  CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0));
-
   // Context data
   CeedQFunctionContext ctx;
   IdentityCtx          ctx_data = {.size = 1};
+
   CeedCall(CeedQFunctionContextCreate(ceed, &ctx));
   CeedCall(CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctx_data), &ctx_data));
   CeedCall(CeedQFunctionContextRegisterInt32(ctx, "size", offsetof(IdentityCtx, size), 1, "field size of identity QFunction"));
   CeedCall(CeedQFunctionSetContext(qf, ctx));
   CeedCall(CeedQFunctionContextDestroy(&ctx));
 
+  CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass-vector/ceed-vectormassapply.c b/gallery/mass-vector/ceed-vectormassapply.c
index 388f203802..bae789a0c5 100644
--- a/gallery/mass-vector/ceed-vectormassapply.c
+++ b/gallery/mass-vector/ceed-vectormassapply.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3MassApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3MassApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "u", num_comp, CEED_EVAL_INTERP));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", 1, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "v", num_comp, CEED_EVAL_INTERP));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-mass1dbuild.c b/gallery/mass/ceed-mass1dbuild.c
index 13f6ce0f77..7931ad0c36 100644
--- a/gallery/mass/ceed-mass1dbuild.c
+++ b/gallery/mass/ceed-mass1dbuild.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Mass1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Mass1DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-mass2dbuild.c b/gallery/mass/ceed-mass2dbuild.c
index b4431443a2..961ddbf2e9 100644
--- a/gallery/mass/ceed-mass2dbuild.c
+++ b/gallery/mass/ceed-mass2dbuild.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Mass2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Mass2DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 4));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-mass3dbuild.c b/gallery/mass/ceed-mass3dbuild.c
index 58ffc2fc9e..e4edf2dd85 100644
--- a/gallery/mass/ceed-mass3dbuild.c
+++ b/gallery/mass/ceed-mass3dbuild.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Mass3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Mass3DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-massapply.c b/gallery/mass/ceed-massapply.c
index cefa208c0f..11c19aa799 100644
--- a/gallery/mass/ceed-massapply.c
+++ b/gallery/mass/ceed-massapply.c
@@ -16,6 +16,7 @@
 static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "MassApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
@@ -24,7 +25,6 @@ static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, CeedQFu
   CeedCall(CeedQFunctionAddOutput(qf, "v", 1, CEED_EVAL_INTERP));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
index 433eca7f2e..d49026a97d 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3Poisson1DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1, num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
index 4efd1225f6..7e4031f477 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3Poisson2DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2, num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 6));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
index d35e0ae3d9..9e1864287f 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3Poisson3DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3, num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 15));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson1dapply.c b/gallery/poisson/ceed-poisson1dapply.c
index 2112d24ce2..b007a60092 100644
--- a/gallery/poisson/ceed-poisson1dapply.c
+++ b/gallery/poisson/ceed-poisson1dapply.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson1DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson1dbuild.c b/gallery/poisson/ceed-poisson1dbuild.c
index c54aa72f55..cd8075a0e8 100644
--- a/gallery/poisson/ceed-poisson1dbuild.c
+++ b/gallery/poisson/ceed-poisson1dbuild.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson1DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson2dapply.c b/gallery/poisson/ceed-poisson2dapply.c
index 1c90c84c5c..d055386dfe 100644
--- a/gallery/poisson/ceed-poisson2dapply.c
+++ b/gallery/poisson/ceed-poisson2dapply.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson2DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 6));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson2dbuild.c b/gallery/poisson/ceed-poisson2dbuild.c
index 7310aab0d7..7768b0d29f 100644
--- a/gallery/poisson/ceed-poisson2dbuild.c
+++ b/gallery/poisson/ceed-poisson2dbuild.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson2DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 17));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson3dapply.c b/gallery/poisson/ceed-poisson3dapply.c
index a5e0207b15..bcce1a9dc1 100644
--- a/gallery/poisson/ceed-poisson3dapply.c
+++ b/gallery/poisson/ceed-poisson3dapply.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson3DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson3dbuild.c b/gallery/poisson/ceed-poisson3dbuild.c
index 8054f3d4ad..3ae866ed7d 100644
--- a/gallery/poisson/ceed-poisson3dbuild.c
+++ b/gallery/poisson/ceed-poisson3dbuild.c
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson3DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 69));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/scale/ceed-scale-scalar.c b/gallery/scale/ceed-scale-scalar.c
index c98554c2e6..ff950dbf49 100644
--- a/gallery/scale/ceed-scale-scalar.c
+++ b/gallery/scale/ceed-scale-scalar.c
@@ -16,6 +16,7 @@
 static int CeedQFunctionInit_ScaleScalar(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Scale (scalar)";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
diff --git a/gallery/scale/ceed-scale.c b/gallery/scale/ceed-scale.c
index 6f86879e2c..f998ac38e4 100644
--- a/gallery/scale/ceed-scale.c
+++ b/gallery/scale/ceed-scale.c
@@ -16,10 +16,10 @@
 static int CeedQFunctionInit_Scale(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Scale";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h
index a502fce59d..110083b372 100644
--- a/include/ceed/jit-source/gallery/ceed-identity.h
+++ b/include/ceed/jit-source/gallery/ceed-identity.h
@@ -26,6 +26,5 @@ CEED_QFUNCTION(Identity)(void *ctx, const CeedInt Q, const CeedScalar *const *in
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q * size; i++) { output[i] = input[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
index dfa9c4ae7e..d9a985a56d 100644
--- a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
@@ -19,6 +19,5 @@ CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[i] * w[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
index a23c14858a..4a6946ebce 100644
--- a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
@@ -21,6 +21,5 @@ CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
index fdff95017e..1d7f094dba 100644
--- a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
@@ -23,6 +23,5 @@ CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const
                  J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
                 w[i];
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-massapply.h b/include/ceed/jit-source/gallery/ceed-massapply.h
index bfd2c99491..41a0695e39 100644
--- a/include/ceed/jit-source/gallery/ceed-massapply.h
+++ b/include/ceed/jit-source/gallery/ceed-massapply.h
@@ -19,6 +19,5 @@ CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *i
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = u[i] * q_data[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
index 6400c5d1b2..d23f134eb0 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
@@ -20,6 +20,5 @@ CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *con
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
index 94b366d997..b84fa01d31 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
@@ -23,6 +23,5 @@ CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = w[i] / J[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
index 3e94ac700f..62329ad1a4 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
@@ -34,6 +34,5 @@ CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *con
     // j = direction of vg
     for (CeedInt j = 0; j < dim; j++) vg[j][i] = (ug[0][i] * dXdxdXdxT[0][j] + ug[1][i] * dXdxdXdxT[1][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
index 23f00a26dd..8546c304cd 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
@@ -29,10 +29,10 @@ CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con
     const CeedScalar J01 = J[1][0][i];
     const CeedScalar J11 = J[1][1][i];
     const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
-    q_data[0][i]         = qw * (J01 * J01 + J11 * J11);
-    q_data[1][i]         = qw * (J00 * J00 + J10 * J10);
-    q_data[2][i]         = -qw * (J00 * J01 + J10 * J11);
-  }  // End of Quadrature Point Loop
 
+    q_data[0][i] = qw * (J01 * J01 + J11 * J11);
+    q_data[1][i] = qw * (J00 * J00 + J10 * J10);
+    q_data[2][i] = -qw * (J00 * J01 + J10 * J11);
+  }  // End of Quadrature Point Loop
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
index 4b3f687494..77295c9fb8 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
@@ -36,6 +36,5 @@ CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *con
     // j = direction of vg
     for (CeedInt j = 0; j < dim; j++) vg[j][i] = (ug[0][i] * dXdxdXdxT[0][j] + ug[1][i] * dXdxdXdxT[1][j] + ug[2][i] * dXdxdXdxT[2][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
index 71ea0d1a69..b42bbb93f9 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
@@ -46,6 +46,5 @@ CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con
     q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
     q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-scale-scalar.h b/include/ceed/jit-source/gallery/ceed-scale-scalar.h
index bd1c210084..f70c62ec9a 100644
--- a/include/ceed/jit-source/gallery/ceed-scale-scalar.h
+++ b/include/ceed/jit-source/gallery/ceed-scale-scalar.h
@@ -25,5 +25,5 @@ CEED_QFUNCTION(ScaleScalar)(void *ctx, const CeedInt Q, const CeedScalar *const
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     for (CeedInt j = 0; j < size; j++) output[i + j * Q] = input[i + j * Q] * scale[i];
   }  // End of Quadrature Point Loop
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-scale.h b/include/ceed/jit-source/gallery/ceed-scale.h
index 12fcfe5277..6c0157f7e2 100644
--- a/include/ceed/jit-source/gallery/ceed-scale.h
+++ b/include/ceed/jit-source/gallery/ceed-scale.h
@@ -23,5 +23,5 @@ CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, C
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q * size; i++) { output[i] = input[i] * scale[i]; }  // End of Quadrature Point Loop
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectormassapply.h b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
index 072ccc4cd0..adc67918f6 100644
--- a/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
@@ -25,6 +25,5 @@ CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *c
       v[c][i] = u[c][i] * q_data[i];
     }
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
index 928a5e9882..8921c348ae 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
@@ -25,6 +25,5 @@ CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScal
       vg[c][i] = ug[c][i] * q_data[i];
     }
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
index 16f98bed6b..12f7d73468 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
@@ -35,6 +35,5 @@ CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScal
     for (CeedInt j = 0; j < dim; j++)
       for (CeedInt c = 0; c < num_comp; c++) vg[j][c][i] = (ug[0][c][i] * dXdxdXdxT[0][j] + ug[1][c][i] * dXdxdXdxT[1][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
index 60e296ac81..634ecb01a5 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
@@ -38,6 +38,5 @@ CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScal
       for (CeedInt c = 0; c < num_comp; c++)
         vg[j][c][i] = (ug[0][c][i] * dXdxdXdxT[0][j] + ug[1][c][i] * dXdxdXdxT[1][j] + ug[2][c][i] * dXdxdXdxT[2][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }

From 32db0c4dc7e6b5d9c7baf16042a707676f075368 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Feb 2026 09:25:30 -0700
Subject: [PATCH 544/571] pc - use scalar multiplicity vec for multigrid

---
 interface/ceed-preconditioning.c | 54 +++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 1ccc3baa42..71f6c92eac 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1097,21 +1097,53 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
 
   // Multiplicity vector
   if (op_restrict || op_prolong) {
-    CeedVector          mult_e_vec;
+    CeedInt             num_elem, num_comp, elem_size;
+    CeedVector          mult_l_vec, mult_e_vec;
     CeedRestrictionType rstr_type;
+    CeedElemRestriction rstr_p_mult_full;
 
     CeedCall(CeedElemRestrictionGetType(rstr_fine, &rstr_type));
     CeedCheck(rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_UNSUPPORTED,
               "Element restrictions created with CeedElemRestrictionCreateCurlOriented are not supported");
     CeedCheck(p_mult_fine, ceed, CEED_ERROR_INCOMPATIBLE, "Prolongation or restriction operator creation requires fine grid multiplicity vector");
-    CeedCall(CeedElemRestrictionCreateUnsignedCopy(rstr_fine, &rstr_p_mult_fine));
-    CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_vec, &mult_e_vec));
+
+    // Create multiplicity multi-component l-vector
+    CeedCall(CeedElemRestrictionCreateUnsignedCopy(rstr_fine, &rstr_p_mult_full));
+    CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_l_vec, &mult_e_vec));
     CeedCall(CeedVectorSetValue(mult_e_vec, 0.0));
-    CeedCall(CeedElemRestrictionApply(rstr_p_mult_fine, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE));
-    CeedCall(CeedVectorSetValue(mult_vec, 0.0));
-    CeedCall(CeedElemRestrictionApply(rstr_p_mult_fine, CEED_TRANSPOSE, mult_e_vec, mult_vec, CEED_REQUEST_IMMEDIATE));
+    CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE));
+    CeedCall(CeedVectorSetValue(mult_l_vec, 0.0));
+    CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_TRANSPOSE, mult_e_vec, mult_l_vec, CEED_REQUEST_IMMEDIATE));
+    CeedCall(CeedVectorReciprocal(mult_l_vec));
+
+    // Create multiplicity single component e-vector
+    CeedCall(CeedElemRestrictionGetNumElements(rstr_p_mult_full, &num_elem));
+    CeedCall(CeedElemRestrictionGetNumComponents(rstr_p_mult_full, &num_comp));
+    CeedCall(CeedElemRestrictionGetElementSize(rstr_p_mult_full, &elem_size));
+    CeedCall(CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, num_elem * elem_size, CEED_STRIDES_BACKEND, &rstr_p_mult_fine));
+    CeedCall(CeedElemRestrictionCreateVector(rstr_p_mult_fine, &mult_vec, NULL));
+    {
+      CeedQFunction qf_to_scalar;
+      CeedOperator  op_to_scalar;
+
+      CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Identity to scalar", &qf_to_scalar));
+      CeedCall(CeedQFunctionAddInput(qf_to_scalar, "input", num_comp, CEED_EVAL_NONE));
+      CeedCall(CeedQFunctionAddOutput(qf_to_scalar, "output", 1, CEED_EVAL_NONE));
+
+      CeedCall(CeedOperatorCreate(ceed, qf_to_scalar, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_scalar));
+      CeedCall(CeedOperatorSetField(op_to_scalar, "input", rstr_p_mult_full, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+      CeedCall(CeedOperatorSetField(op_to_scalar, "output", rstr_p_mult_fine, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+      CeedCall(CeedOperatorApply(op_to_scalar, mult_l_vec, mult_vec, CEED_REQUEST_IMMEDIATE));
+
+      // Clean-up
+      CeedCall(CeedQFunctionDestroy(&qf_to_scalar));
+      CeedCall(CeedOperatorDestroy(&op_to_scalar));
+    }
+    // Clean-up
     CeedCall(CeedVectorDestroy(&mult_e_vec));
-    CeedCall(CeedVectorReciprocal(mult_vec));
+    CeedCall(CeedVectorDestroy(&mult_l_vec));
+    CeedCall(CeedElemRestrictionDestroy(&rstr_p_mult_full));
   }
 
   // Clone name
@@ -1132,7 +1164,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedQFunctionContext ctx_r;
     CeedQFunction        qf_restrict;
 
-    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_restrict));
+    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale (scalar)", &qf_restrict));
     CeedCall(CeedCalloc(1, &num_comp_r_data));
     num_comp_r_data[0] = num_comp;
     CeedCall(CeedQFunctionContextCreate(ceed, &ctx_r));
@@ -1140,7 +1172,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedCall(CeedQFunctionSetContext(qf_restrict, ctx_r));
     CeedCall(CeedQFunctionContextDestroy(&ctx_r));
     CeedCall(CeedQFunctionAddInput(qf_restrict, "input", num_comp, CEED_EVAL_NONE));
-    CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", num_comp, CEED_EVAL_NONE));
+    CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", 1, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionAddOutput(qf_restrict, "output", num_comp, CEED_EVAL_INTERP));
     CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_restrict, num_comp));
 
@@ -1170,7 +1202,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedQFunctionContext ctx_p;
     CeedQFunction        qf_prolong;
 
-    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_prolong));
+    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale (scalar)", &qf_prolong));
     CeedCall(CeedCalloc(1, &num_comp_p_data));
     num_comp_p_data[0] = num_comp;
     CeedCall(CeedQFunctionContextCreate(ceed, &ctx_p));
@@ -1178,7 +1210,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedCall(CeedQFunctionSetContext(qf_prolong, ctx_p));
     CeedCall(CeedQFunctionContextDestroy(&ctx_p));
     CeedCall(CeedQFunctionAddInput(qf_prolong, "input", num_comp, CEED_EVAL_INTERP));
-    CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", num_comp, CEED_EVAL_NONE));
+    CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", 1, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionAddOutput(qf_prolong, "output", num_comp, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_prolong, num_comp));
 

From 397d7ab35e859d6657fd0554ae95a0acfff6fe8d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 9 Feb 2026 10:03:03 -0700
Subject: [PATCH 545/571] pc - test to use scalar or multi-comp mult vector

---
 interface/ceed-preconditioning.c | 69 ++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 71f6c92eac..b9cf0b8ed8 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1013,7 +1013,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
                                                        CeedOperator *op_prolong, CeedOperator *op_restrict) {
   bool                is_composite;
   Ceed                ceed;
-  CeedInt             num_comp, num_input_fields, num_output_fields;
+  CeedInt             dim              = 0, num_comp, num_input_fields, num_output_fields;
   CeedVector          mult_vec         = NULL;
   CeedElemRestriction rstr_p_mult_fine = NULL, rstr_fine = NULL;
   CeedOperatorField  *input_fields, *output_fields;
@@ -1060,6 +1060,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
       CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr));
       CeedCall(CeedOperatorFieldGetBasis(input_fields[i], &basis));
     }
+    if (dim == 0) CeedCall(CeedBasisGetDimension(basis, &dim));
     CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec));
     CeedCall(CeedVectorDestroy(&vec));
     CeedCall(CeedElemRestrictionDestroy(&rstr));
@@ -1082,11 +1083,13 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
       CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr));
       CeedCall(CeedOperatorFieldGetBasis(output_fields[i], &basis));
     }
+    if (dim == 0) CeedCall(CeedBasisGetDimension(basis, &dim));
     CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec));
     CeedCall(CeedVectorDestroy(&vec));
     CeedCall(CeedElemRestrictionDestroy(&rstr));
     CeedCall(CeedBasisDestroy(&basis));
   }
+  dim = dim ? dim : 1;
   // -- Clone QFunctionAssemblyData
   {
     CeedQFunctionAssemblyData fine_data;
@@ -1096,6 +1099,8 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
   }
 
   // Multiplicity vector
+  bool use_scalar_mult = true;
+
   if (op_restrict || op_prolong) {
     CeedInt             num_elem, num_comp, elem_size;
     CeedVector          mult_l_vec, mult_e_vec;
@@ -1116,29 +1121,43 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_TRANSPOSE, mult_e_vec, mult_l_vec, CEED_REQUEST_IMMEDIATE));
     CeedCall(CeedVectorReciprocal(mult_l_vec));
 
-    // Create multiplicity single component e-vector
-    CeedCall(CeedElemRestrictionGetNumElements(rstr_p_mult_full, &num_elem));
-    CeedCall(CeedElemRestrictionGetNumComponents(rstr_p_mult_full, &num_comp));
-    CeedCall(CeedElemRestrictionGetElementSize(rstr_p_mult_full, &elem_size));
-    CeedCall(CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, num_elem * elem_size, CEED_STRIDES_BACKEND, &rstr_p_mult_fine));
-    CeedCall(CeedElemRestrictionCreateVector(rstr_p_mult_fine, &mult_vec, NULL));
+    // Determine to use scalar multiplicity or not
     {
-      CeedQFunction qf_to_scalar;
-      CeedOperator  op_to_scalar;
-
-      CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Identity to scalar", &qf_to_scalar));
-      CeedCall(CeedQFunctionAddInput(qf_to_scalar, "input", num_comp, CEED_EVAL_NONE));
-      CeedCall(CeedQFunctionAddOutput(qf_to_scalar, "output", 1, CEED_EVAL_NONE));
-
-      CeedCall(CeedOperatorCreate(ceed, qf_to_scalar, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_scalar));
-      CeedCall(CeedOperatorSetField(op_to_scalar, "input", rstr_p_mult_full, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-      CeedCall(CeedOperatorSetField(op_to_scalar, "output", rstr_p_mult_fine, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+      const CeedInt p = pow(elem_size, 1.0 / dim);
 
-      CeedCall(CeedOperatorApply(op_to_scalar, mult_l_vec, mult_vec, CEED_REQUEST_IMMEDIATE));
+      use_scalar_mult = num_comp > 1 && (dim < 3 || num_comp - 1 > (3 * (pow(p, dim - 1) - pow(p, dim - 2)) + 1) / pow(p - 1, dim));
+    }
 
-      // Clean-up
-      CeedCall(CeedQFunctionDestroy(&qf_to_scalar));
-      CeedCall(CeedOperatorDestroy(&op_to_scalar));
+    if (use_scalar_mult) {
+      // Create multiplicity single component e-vector
+      CeedCall(CeedElemRestrictionGetNumElements(rstr_p_mult_full, &num_elem));
+      CeedCall(CeedElemRestrictionGetNumComponents(rstr_p_mult_full, &num_comp));
+      CeedCall(CeedElemRestrictionGetElementSize(rstr_p_mult_full, &elem_size));
+      CeedCall(CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, num_elem * elem_size, CEED_STRIDES_BACKEND, &rstr_p_mult_fine));
+      CeedCall(CeedElemRestrictionCreateVector(rstr_p_mult_fine, &mult_vec, NULL));
+      {
+        CeedQFunction qf_to_scalar;
+        CeedOperator  op_to_scalar;
+
+        CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Identity to scalar", &qf_to_scalar));
+        CeedCall(CeedQFunctionAddInput(qf_to_scalar, "input", num_comp, CEED_EVAL_NONE));
+        CeedCall(CeedQFunctionAddOutput(qf_to_scalar, "output", 1, CEED_EVAL_NONE));
+
+        CeedCall(CeedOperatorCreate(ceed, qf_to_scalar, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_scalar));
+        CeedCall(CeedOperatorSetField(op_to_scalar, "input", rstr_p_mult_full, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+        CeedCall(CeedOperatorSetField(op_to_scalar, "output", rstr_p_mult_fine, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+        CeedCall(CeedOperatorApply(op_to_scalar, mult_l_vec, mult_vec, CEED_REQUEST_IMMEDIATE));
+
+        // Clean-up
+        CeedCall(CeedQFunctionDestroy(&qf_to_scalar));
+        CeedCall(CeedOperatorDestroy(&op_to_scalar));
+      }
+    } else {
+      mult_vec = NULL;
+      CeedCall(CeedVectorReferenceCopy(mult_l_vec, &mult_vec));
+      rstr_p_mult_fine = NULL;
+      CeedCall(CeedElemRestrictionReferenceCopy(rstr_p_mult_full, &rstr_p_mult_fine));
     }
     // Clean-up
     CeedCall(CeedVectorDestroy(&mult_e_vec));
@@ -1164,7 +1183,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedQFunctionContext ctx_r;
     CeedQFunction        qf_restrict;
 
-    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale (scalar)", &qf_restrict));
+    CeedCall(CeedQFunctionCreateInteriorByName(ceed, use_scalar_mult ? "Scale (scalar)" : "Scale", &qf_restrict));
     CeedCall(CeedCalloc(1, &num_comp_r_data));
     num_comp_r_data[0] = num_comp;
     CeedCall(CeedQFunctionContextCreate(ceed, &ctx_r));
@@ -1172,7 +1191,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedCall(CeedQFunctionSetContext(qf_restrict, ctx_r));
     CeedCall(CeedQFunctionContextDestroy(&ctx_r));
     CeedCall(CeedQFunctionAddInput(qf_restrict, "input", num_comp, CEED_EVAL_NONE));
-    CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", 1, CEED_EVAL_NONE));
+    CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", use_scalar_mult ? 1 : num_comp, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionAddOutput(qf_restrict, "output", num_comp, CEED_EVAL_INTERP));
     CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_restrict, num_comp));
 
@@ -1202,7 +1221,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedQFunctionContext ctx_p;
     CeedQFunction        qf_prolong;
 
-    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale (scalar)", &qf_prolong));
+    CeedCall(CeedQFunctionCreateInteriorByName(ceed, use_scalar_mult ? "Scale (scalar)" : "Scale", &qf_prolong));
     CeedCall(CeedCalloc(1, &num_comp_p_data));
     num_comp_p_data[0] = num_comp;
     CeedCall(CeedQFunctionContextCreate(ceed, &ctx_p));
@@ -1210,7 +1229,7 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
     CeedCall(CeedQFunctionSetContext(qf_prolong, ctx_p));
     CeedCall(CeedQFunctionContextDestroy(&ctx_p));
     CeedCall(CeedQFunctionAddInput(qf_prolong, "input", num_comp, CEED_EVAL_INTERP));
-    CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", 1, CEED_EVAL_NONE));
+    CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", use_scalar_mult ? 1 : num_comp, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionAddOutput(qf_prolong, "output", num_comp, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_prolong, num_comp));
 

From 334b1dcfef601403a7980e03f226e013098bd6c0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Feb 2026 11:07:57 -0700
Subject: [PATCH 546/571] minor - style consistency for bibtex in README

---
 README.md | 64 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 84b46748bf..9725a1422c 100644
--- a/README.md
+++ b/README.md
@@ -401,7 +401,22 @@ If you utilize libCEED please cite:
 
 ```bibtex
 @article{libceed-joss-paper,
-  author       = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stan Tomov},
+  author       = {
+    Brown, Jed and
+    Abdelfattah, Ahmad and
+    Barra, Valeria and
+    Beams, Natalie and
+    Camier, Jean-Sylvain and
+    Dobrev, Veselin and
+    Dudouit, Yohann and
+    Ghaffari, Leila and
+    Kolev, Tzanio and
+    Medina, David and
+    Pazner, Will and
+    Ratnayaka, Thilina and
+    Thompson, Jeremy L. and
+    Tomov, Stan
+  },
   title        = {{libCEED}: Fast algebra for high-order element-based discretizations},
   journal      = {Journal of Open Source Software},
   year         = {2021},
@@ -418,23 +433,25 @@ To cite the user manual:
 
 ```bibtex
 @misc{libceed-user-manual,
-  author       = {Abdelfattah, Ahmad and
-                  Barra, Valeria and
-                  Beams, Natalie and
-                  Brown, Jed and
-                  Camier, Jean-Sylvain and
-                  Dobrev, Veselin and
-                  Dudouit, Yohann and
-                  Ghaffari, Leila and
-                  Grimberg, Sebastian and
-                  Kolev, Tzanio and
-                  Medina, David and
-                  Pazner, Will and
-                  Ratnayaka, Thilina and
-                  Shakeri, Rezgar and
-                  Thompson, Jeremy L and
-                  Tomov, Stanimire and
-                  Wright III, James},
+  author       = {
+    Abdelfattah, Ahmad and
+    Barra, Valeria and
+    Beams, Natalie and
+    Brown, Jed and
+    Camier, Jean-Sylvain and
+    Dobrev, Veselin and
+    Dudouit, Yohann and
+    Ghaffari, Leila and
+    Grimberg, Sebastian and
+    Kolev, Tzanio and
+    Medina, David and
+    Pazner, Will and
+    Ratnayaka, Thilina and
+    Shakeri, Rezgar and
+    Thompson, Jeremy L. and
+    Tomov, Stanimire and
+    Wright III, James
+  },
   title        = {{libCEED} User Manual},
   month        = nov,
   year         = 2023,
@@ -447,9 +464,14 @@ To cite the user manual:
 For libCEED's Python interface please cite:
 
 ```bibtex
-@InProceedings{libceed-paper-proc-scipy-2020,
-  author    = {{V}aleria {B}arra and {J}ed {B}rown and {J}eremy {T}hompson and {Y}ohann {D}udouit},
-  title     = {{H}igh-performance operator evaluations with ease of use: lib{C}{E}{E}{D}'s {P}ython interface},
+@InProceedings{libceed-scipy,
+  author    = {
+    Barra, Valeria and
+    Brown, Jed and
+    Thompson, Jeremy L. and
+    Dudouit, Yohann
+  },
+  title     = {{H}igh-performance operator evaluations with ease of use: {libCEED}'s {P}ython interface},
   booktitle = {{P}roceedings of the 19th {P}ython in {S}cience {C}onference},
   pages     = {85 - 90},
   year      = {2020},

From be3958531954aa94a1b1db4aa6e1a06adbd54dac Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 18 Feb 2026 14:06:37 -0700
Subject: [PATCH 547/571] operator(composite): add interface to force
 sequential execution of sub-operators

---
 backends/cuda-gen/ceed-cuda-gen-operator.c | 12 ++++---
 backends/hip-gen/ceed-hip-gen-operator.c   | 21 ++++++-----
 include/ceed-impl.h                        |  1 +
 include/ceed/ceed.h                        |  2 ++
 interface/ceed-operator.c                  | 42 ++++++++++++++++++++++
 5 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 41ce035b7e..97fcf6b4b0 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -294,30 +294,32 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 }
 
 static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool              is_run_good[CEED_COMPOSITE_MAX] = {false};
+  bool              is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential;
   CeedInt           num_suboperators;
   const CeedScalar *input_arr  = NULL;
   CeedScalar       *output_arr = NULL;
   Ceed              ceed;
   CeedOperator     *sub_operators;
+  cudaStream_t      stream = NULL;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
   CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+  CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  if (is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream));
   for (CeedInt i = 0; i < num_suboperators; i++) {
     CeedInt num_elem = 0;
 
     CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
     if (num_elem > 0) {
-      cudaStream_t stream = NULL;
-
-      CeedCallCuda(ceed, cudaStreamCreate(&stream));
+      if (!is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream));
       CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request));
-      CeedCallCuda(ceed, cudaStreamDestroy(stream));
+      if (!is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream));
     }
   }
+  if (is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
   CeedCallCuda(ceed, cudaDeviceSynchronize());
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index c5918f914b..f036ae52b5 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -252,7 +252,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 }
 
 static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
-  bool                  is_run_good[CEED_COMPOSITE_MAX] = {false};
+  bool                  is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential;
   CeedInt               num_suboperators;
   const CeedScalar     *input_arr  = NULL;
   CeedScalar           *output_arr = NULL;
@@ -264,23 +264,28 @@ static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector inp
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
   CeedCallBackend(CeedOperatorCompositeGetSubList(op, &sub_operators));
+  CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential));
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
   if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
   for (CeedInt i = 0; i < num_suboperators; i++) {
-    CeedInt num_elem = 0;
+    CeedInt       num_elem     = 0;
+    const CeedInt stream_index = is_sequential ? 0 : i;
 
     CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
     if (num_elem > 0) {
-      if (!impl->streams[i]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[i]));
-      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[i], input_arr, output_arr, &is_run_good[i], request));
+      if (!impl->streams[stream_index]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[stream_index]));
+      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[stream_index], input_arr, output_arr, &is_run_good[i],
+                                                       request));
     } else {
       is_run_good[i] = true;
     }
   }
-
-  for (CeedInt i = 0; i < num_suboperators; i++) {
-    if (impl->streams[i]) {
-      if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
+  if (is_sequential) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[0]));
+  else {
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      if (impl->streams[i]) {
+        if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
+      }
     }
   }
   if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 669f039d5a..e5f8773f37 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -382,6 +382,7 @@ struct CeedOperator_private {
   bool                      is_composite;
   bool                      is_at_points;
   bool                      has_restriction;
+  bool                      is_sequential;
   CeedQFunctionAssemblyData qf_assembled;
   CeedOperatorAssemblyData  op_assembled;
   CeedOperator             *sub_operators;
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index d5db4a4bc5..a76b9238c3 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -447,6 +447,8 @@ CEED_EXTERN int  CeedOperatorCompositeAddSub(CeedOperator composite_op, CeedOper
 CEED_EXTERN int  CeedOperatorCompositeGetNumSub(CeedOperator op, CeedInt *num_suboperators);
 CEED_EXTERN int  CeedOperatorCompositeGetSubList(CeedOperator op, CeedOperator **sub_operators);
 CEED_EXTERN int  CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op);
+CEED_EXTERN int  CeedOperatorCompositeSetSequential(CeedOperator op, bool is_sequential);
+CEED_EXTERN int  CeedOperatorCompositeIsSequential(CeedOperator op, bool *is_sequential);
 CEED_EXTERN int  CeedOperatorCheckReady(CeedOperator op);
 CEED_EXTERN int  CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
 CEED_EXTERN int  CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 6eb0b74d46..55e72cb2c5 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1390,6 +1390,48 @@ int CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, Ceed
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set whether the sub-operators of the composite `CeedOperator` must be run sequentially.
+
+  Note: This value currently only affects the GPU `/gpu/cuda/gen` and `/gpu/hip/gen` backends.
+
+  @param[in] op            Composite `CeedOperator`
+  @param[in] is_sequential Flag value to set, if `true`, forces the composite `CeedOperator` to execute sequentially
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedOperatorCompositeSetSequential(CeedOperator op, bool is_sequential) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator");
+  op->is_sequential = is_sequential;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get whether the sub-operators of the composite `CeedOperator` must be run sequentially.
+
+  Note: This value currently only affects the GPU `/gpu/cuda/gen` and `/gpu/hip/gen` backends.
+
+  @param[in]  op            Composite `CeedOperator`
+  @param[out] is_sequential Variable to store sequential status
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedOperatorCompositeIsSequential(CeedOperator op, bool *is_sequential) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator");
+  *is_sequential = op->is_sequential;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Check if a `CeedOperator` is ready to be used.
 

From 745f16d1ea9fdcf634671fe63363ae0b5c9a02f9 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 18 Feb 2026 14:10:06 -0700
Subject: [PATCH 548/571] backends(cuda/gen,hip/gen): fix at-points assembly to
 avoid writing to non-active outputs

---
 .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 22 ++++++++++++++-----
 .../hip-gen/ceed-hip-gen-operator-build.cpp   | 22 ++++++++++++++-----
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 79105413fa..7a36364d97 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -702,7 +702,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                      std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
-                                                     bool use_3d_slices) {
+                                                     bool use_3d_slices, bool is_assemble) {
   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
@@ -1029,6 +1029,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           CeedInt             comp_stride;
           CeedElemRestriction elem_rstr;
 
+          if (is_assemble) break;
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
@@ -1583,7 +1584,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                             qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
-                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, false));
 
   // -- Output basis and restriction
   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
@@ -2008,7 +2009,7 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                             qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
-                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
 
   // -- Output basis and restriction
   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
@@ -2274,7 +2275,18 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
     }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    }
   }
 
   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
@@ -2605,7 +2617,7 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                             qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
-                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
 
   // -- Output basis and restriction
   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index fc47a6cdd5..4f2438cffa 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -729,7 +729,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
-                                                    bool use_3d_slices) {
+                                                    bool use_3d_slices, bool is_assemble) {
   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
@@ -1056,6 +1056,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
           CeedInt             comp_stride;
           CeedElemRestriction elem_rstr;
 
+          if (is_assemble) break;
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
@@ -1596,7 +1597,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
-                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices, false));
 
   // -- Output basis and restriction
   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
@@ -2013,7 +2014,7 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
-                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
 
   // -- Output basis and restriction
   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
@@ -2270,7 +2271,18 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
     }
   }
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    }
   }
 
   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
@@ -2601,7 +2613,7 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
-                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
 
   // -- Output basis and restriction
   code << "\n" << tab << "// -- Output field basis action and restrictions\n";

From 736f144a02fcdbcd732ee675ef7e42bf47b1eba5 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Wed, 18 Feb 2026 14:10:24 -0700
Subject: [PATCH 549/571] backends(cuda/ref,hip/ref): fix use of incorrect
 output field index

---
 backends/cuda-ref/ceed-cuda-ref-operator.c | 4 ++--
 backends/hip-ref/ceed-hip-ref-operator.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 1525246bad..b4531fde50 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -594,7 +594,7 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
     if (eval_mode == CEED_EVAL_NONE) {
       CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
       CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 
@@ -942,7 +942,7 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
     if (eval_mode == CEED_EVAL_NONE) {
       CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
       CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 7f3eea4e35..4ef3c76bdb 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -592,7 +592,7 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
     if (eval_mode == CEED_EVAL_NONE) {
       CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
       CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 
@@ -939,7 +939,7 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
     if (eval_mode == CEED_EVAL_NONE) {
       CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
       CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
 

From 11a80696262ed62202a28e3026fb905f003ce9af Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Thu, 19 Feb 2026 10:18:49 -0700
Subject: [PATCH 550/571] ci: add sequential composite operator test

---
 tests/t599-operator.c | 146 ++++++++++++++++++++++++++++++++++++++++++
 tests/t599-operator.h |  16 +++++
 2 files changed, 162 insertions(+)
 create mode 100644 tests/t599-operator.c
 create mode 100644 tests/t599-operator.h

diff --git a/tests/t599-operator.c b/tests/t599-operator.c
new file mode 100644
index 0000000000..1690d438cb
--- /dev/null
+++ b/tests/t599-operator.c
@@ -0,0 +1,146 @@
+/// @file
+/// Test creation, action, and destruction for mass matrix operator at points using sequential composite operator
+/// \test Test creation, action, and destruction for mass matrix operator at points using sequential composite operator
+#include "t599-operator.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed    ceed;
+  CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5;
+  CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedVector          x_points, u, v, u_points;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_u_points, elem_restriction_u;
+  CeedBasis           basis_u;
+  CeedQFunction       qf_to_points, qf_from_points;
+  CeedOperator        op_to_points, op_from_points, op_mass;
+  bool                is_at_points, is_sequential;
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, dim * num_points, &x_points);
+  {
+    CeedScalar x_array[dim * num_points];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      for (CeedInt d = 0; d < dim; d++) {
+        x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+      }
+    }
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  {
+    CeedInt ind_x[num_elem + 1 + num_points];
+
+    for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+    for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_u_points);
+    CeedElemRestrictionCreateVector(elem_restriction_u_points, &u_points, NULL);
+    CeedVectorSetValue(u_points, 0);
+  }
+
+  {
+    CeedInt ind_u[num_elem * p * p];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_to_points);
+  CeedQFunctionAddInput(qf_to_points, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddOutput(qf_to_points, "u_points", 1, CEED_EVAL_NONE);
+
+  CeedOperatorCreateAtPoints(ceed, qf_to_points, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_points);
+  CeedOperatorSetField(op_to_points, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_to_points, "u_points", elem_restriction_u_points, CEED_BASIS_NONE, u_points);
+  CeedOperatorAtPointsSetPoints(op_to_points, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_to_points, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_from_points);
+  CeedQFunctionAddInput(qf_from_points, "u_points", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_from_points, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_from_points, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_from_points);
+  CeedOperatorSetField(op_from_points, "u_points", elem_restriction_u_points, CEED_BASIS_NONE, u_points);
+  CeedOperatorSetField(op_from_points, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_from_points, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_from_points, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  CeedOperatorCreateComposite(ceed, &op_mass);
+  CeedOperatorCompositeSetSequential(op_mass, true);
+  CeedOperatorCompositeAddSub(op_mass, op_to_points);
+  CeedOperatorCompositeAddSub(op_mass, op_from_points);
+
+  CeedVectorCreate(ceed, num_nodes, &u);
+  CeedVectorSetValue(u, 1.0);
+  CeedVectorCreate(ceed, num_nodes, &v);
+  CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);
+
+  CeedOperatorCompositeIsSequential(op_mass, &is_sequential);
+  if (!is_sequential) printf("Error: Composite operator should be sequential\n");
+
+  {
+    CeedScalar        sum = 0.0;
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes; i++) sum += v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+    // Summing 9 reference elements, each 2x2 => 36 sq units area
+    if (fabs(sum - 4.0 * num_elem) > CEED_EPSILON * 5e3) {
+      printf("Incorrect area computed, %g != %g (abs error %g)\n", sum, 4.0 * num_elem, fabs(sum - 4.0 * num_elem));
+    }
+  }
+
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&u_points);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_u_points);
+  CeedElemRestrictionDestroy(&elem_restriction_u);
+  CeedBasisDestroy(&basis_u);
+  CeedQFunctionDestroy(&qf_to_points);
+  CeedQFunctionDestroy(&qf_from_points);
+  CeedOperatorDestroy(&op_to_points);
+  CeedOperatorDestroy(&op_from_points);
+  CeedOperatorDestroy(&op_mass);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t599-operator.h b/tests/t599-operator.h
new file mode 100644
index 0000000000..c50595bc26
--- /dev/null
+++ b/tests/t599-operator.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0];
+  CeedScalar       *v = out[0];
+
+  for (CeedInt i = 0; i < Q; i++) v[i] = u[i];
+  return 0;
+}

From 53ee81eed7fe6a54158b819b478dad66ca478dc3 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 24 Feb 2026 09:06:07 -0700
Subject: [PATCH 551/571] doc - expand style conventions in dev docs

---
 doc/sphinx/source/libCEEDdev.md | 58 +++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md
index cd6bfbd733..e311171a79 100644
--- a/doc/sphinx/source/libCEEDdev.md
+++ b/doc/sphinx/source/libCEEDdev.md
@@ -201,8 +201,66 @@ In addition to those automatically enforced style rules, libCEED tends to follow
 - Type names: `PascalCase` or language specific style
 - Constant names: `CAPS_SNAKE_CASE` or language specific style
 
+In general, variable and function names should avoid abbreviations and err on the side of verbosity to improve readability.
+
 Also, documentation files should have one sentence per line to help make git diffs clearer and less disruptive.
 
+## Function Conventions
+
+### Naming
+All functions in the libCEED library should be prefixed by `Ceed` and generally take a `Ceed` object as its first argument.
+If a function takes, for example, a `CeedOperator` as its first argument, then it should be prefixed with `CeedOperator`.
+
+### Style
+Functions should adhere mostly to the PETSc function style, specifically:
+
+1. All local variables of a particular type (for example, `CeedInt`) should be listed on the same line if possible; otherwise, they should be listed on adjacent lines. For example,
+```c
+// Correct
+CeedInt   a, b, c;
+CeedInt  *d, *e;
+CeedInt **f;
+
+// Incorrect
+CeedInt a, b, c, *d, *e, **f;
+```
+  
+2. Local variables should be initialized in their declaration when possible.
+3. Nearly all functions should have a return type of `int` and return a `CeedErrorType` to allow for error checking.
+4. All functions must start with a single blank line after the local variable declarations.
+5. All libCEED function calls must have their return value checked for errors using the `CeedCall()` or `CeedCallBackend()` macro.
+   This should be wrapped around the function in question.
+6. In libCEED functions, variables must be declared at the beginning of the code block (C90 style), never mixed in with code.
+   However, when variables are only used in a limited scope, it is encouraged to declare them in that scope.
+7. Do not put a blank line immediately before `return CEED_ERROR_SUCCESS;`.
+8. All libCEED functions must use Doxygen comment blocks before their *definition* (not declaration).
+   The block should begin with `/**` and end with `**/`, each on their own line.
+   The block should be indented by two spaces and should contain an `@brief` tag and description, a newline, a line stating whether the function is collective, a  newline, `@param` tags for each parameter, a newline, and a `@return` line formatted exactly as in the example below.
+   All parameter lines in the Doxygen block should be formatted such that parameter names and descriptions are aligned.
+   There should be a exactly one space between `@param[dir]` (where `dir` is `in`, `out`, or `in,out`) and the parameter name for the closest pair, as well as  between the parameter name and description.
+    For example:
+```c
+/**
+  @brief Initialize a `Ceed` context to use the specified resource.
+
+  Note: Prefixing the resource with "help:" (e.g. "help:/cpu/self") will result in @ref CeedInt() printing the current libCEED version number and a list of current available backend resources to `stderr`.
+
+  @param[in]  resource Resource to use, e.g., "/cpu/self"
+  @param[out] ceed     The library context
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+
+  @sa CeedRegister() CeedDestroy()
+**/
+int CeedInit(const char *resource, Ceed *ceed) {
+```
+9. Function declarations should include parameter names, which must exactly match those in the function definition.
+10. External functions, i.e. those used in tests or examples, must have their *declarations* prefixed with `CEED_EXTERN`.
+    All other functions should have their *declarations* prefixed with `CEED_INTERN`.
+    Function *definitions* should have neither.
+
 ## Clang-tidy
 
 Please check your code for common issues by running

From c5ba8324130e3dcc66653556c3e2d63ed4a4d0bf Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Mon, 2 Mar 2026 14:12:43 -0700
Subject: [PATCH 552/571] minor - typo

---
 backends/hip-gen/ceed-hip-gen-operator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index f036ae52b5..7532ba55b6 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -494,7 +494,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool
   if (data->use_assembly_fallback) {
     CeedOperator op_fallback;
 
-    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LineearAssembleQFunction\n");
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n");
     CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
     CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
     return CEED_ERROR_SUCCESS;

From 1fd91769f9fc8c16ac47e9d6babdb436cb7aa0b0 Mon Sep 17 00:00:00 2001
From: Zach Atkins <zach.atkins@colorado.edu>
Date: Fri, 6 Mar 2026 16:11:09 -0700
Subject: [PATCH 553/571] interface: fix uninitialized use in
 CeedOperatorMultigridLevelCreateSingle_Core

---
 interface/ceed-preconditioning.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index b9cf0b8ed8..dd83302f2d 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1114,6 +1114,9 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
 
     // Create multiplicity multi-component l-vector
     CeedCall(CeedElemRestrictionCreateUnsignedCopy(rstr_fine, &rstr_p_mult_full));
+    CeedCall(CeedElemRestrictionGetNumElements(rstr_p_mult_full, &num_elem));
+    CeedCall(CeedElemRestrictionGetNumComponents(rstr_p_mult_full, &num_comp));
+    CeedCall(CeedElemRestrictionGetElementSize(rstr_p_mult_full, &elem_size));
     CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_l_vec, &mult_e_vec));
     CeedCall(CeedVectorSetValue(mult_e_vec, 0.0));
     CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE));
@@ -1130,9 +1133,6 @@ static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, Cee
 
     if (use_scalar_mult) {
       // Create multiplicity single component e-vector
-      CeedCall(CeedElemRestrictionGetNumElements(rstr_p_mult_full, &num_elem));
-      CeedCall(CeedElemRestrictionGetNumComponents(rstr_p_mult_full, &num_comp));
-      CeedCall(CeedElemRestrictionGetElementSize(rstr_p_mult_full, &elem_size));
       CeedCall(CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, num_elem * elem_size, CEED_STRIDES_BACKEND, &rstr_p_mult_fine));
       CeedCall(CeedElemRestrictionCreateVector(rstr_p_mult_fine, &mult_vec, NULL));
       {

From fcde94cc9faa6e936e57edcecf8187b6b7c37ec7 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Mar 2026 11:46:15 -0600
Subject: [PATCH 554/571] cov - add missing exclusion markers

---
 backends/blocked/ceed-blocked-operator.c | 2 ++
 backends/opt/ceed-opt-operator.c         | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index 61663b2406..0161c2819f 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -102,9 +102,11 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
           CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides,
                                                                   &block_rstr[i + start_e]));
         } break;
+        // LCOV_EXCL_START
         case CEED_RESTRICTION_POINTS:
           // Empty case - won't occur
           break;
+          // LCOV_EXCL_STOP
       }
       CeedCallBackend(CeedDestroy(&ceed_rstr));
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 3a67a97607..5f072d1e25 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -102,9 +102,11 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
           CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides,
                                                                   &block_rstr[i + start_e]));
         } break;
+        // LCOV_EXCL_START
         case CEED_RESTRICTION_POINTS:
           // Empty case - won't occur
           break;
+          // LCOV_EXCL_STOP
       }
       CeedCallBackend(CeedDestroy(&ceed_rstr));
       CeedCallBackend(CeedElemRestrictionDestroy(&rstr));

From f974b973fa8cf4f5e9d344a7689934d9b40c0d75 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Mar 2026 11:53:43 -0600
Subject: [PATCH 555/571] ci - remove cov substitution causing error

---
 .gitlab-ci.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5df451315e..a907bfb1db 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -121,7 +121,7 @@ noether-cpu:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -209,7 +209,7 @@ noether-rust-qfunctions:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -290,7 +290,7 @@ noether-cuda:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -357,7 +357,7 @@ noether-cuda:
 #  after_script:
 #    - |
 #      if [ -f .SUCCESS ]; then
-#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
+#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -460,7 +460,7 @@ noether-float:
   after_script:
     - |
       if [ $(cat .job_status) == "SUCCESS" ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;

From 1f89dcd0c2c1efa76f2e2a04143691882e0e11c0 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Mar 2026 14:28:03 -0600
Subject: [PATCH 556/571] ci - better flags for cov accuracy on Noether

---
 .gitlab-ci.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a907bfb1db..cacab6fdf8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -34,7 +34,7 @@ noether-asan:
   script:
     - rm -f .SUCCESS
     # libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast'
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
@@ -83,7 +83,7 @@ noether-cpu:
   script:
     - rm -f .SUCCESS
     # libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast'
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
@@ -158,7 +158,7 @@ noether-cpu:
 #  script:
 #    - rm -f .SUCCESS
 #    # libCEED
-#    - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-O -march=native -ffp-contract=fast'
+#    - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
 #    - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ')
 #    - echo "-------------- libCEED -------------" && make info
 #    - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL
@@ -195,7 +195,7 @@ noether-rust-qfunctions:
     - rustup component add rust-src --toolchain nightly
     - rustup component add llvm-tools --toolchain nightly
     # libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - echo "-------------- libCEED -------------" && make info
     - make clean
     - make -k -j$NPROC_CPU -l$NPROC_CPU
@@ -248,7 +248,7 @@ noether-cuda:
   script:
     - rm -f .SUCCESS
     # libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - echo "-------------- libCEED -------------" && make info
     - BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
@@ -332,7 +332,7 @@ noether-cuda:
 #  script:
 #    - rm -f .SUCCESS
 #    # libCEED
-#    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-O -march=native -ffp-contract=fast'
+#    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
 #    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
 #    - echo "-------------- libCEED -------------" && make info
 #    - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
@@ -394,7 +394,7 @@ noether-rocm:
   script:
     - rm -f .SUCCESS
     # libCEED
-    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-O -march=native -ffp-contract=fast'
+    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
@@ -440,7 +440,7 @@ noether-float:
     # Change to single precision
     - sed -i 's/ceed-f64/ceed-f32/1' include/ceed/types.h
     # Build libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU

From ad35aa9384f523fdfb9cc65d38cd355beee8a0d4 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Mar 2026 14:52:27 -0600
Subject: [PATCH 557/571] ci - substitute only sometimes needed

---
 .gitlab-ci.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cacab6fdf8..d3188148a5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -121,7 +121,7 @@ noether-cpu:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -209,7 +209,7 @@ noether-rust-qfunctions:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -290,7 +290,7 @@ noether-cuda:
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -357,7 +357,7 @@ noether-cuda:
 #  after_script:
 #    - |
 #      if [ -f .SUCCESS ]; then
-#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
 #        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -460,7 +460,7 @@ noether-float:
   after_script:
     - |
       if [ $(cat .job_status) == "SUCCESS" ]; then
-        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;

From 55dd3a7683bc8607434f8cf34c6e149a38b4fa8f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Mar 2026 15:16:50 -0600
Subject: [PATCH 558/571] ci - add missing exclusion markers

---
 backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c        | 4 ++++
 backends/cuda-ref/ceed-cuda-ref-vector.c                  | 4 ++++
 backends/hip-ref/ceed-hip-ref-qfunctioncontext.c          | 4 ++++
 backends/hip-ref/ceed-hip-ref-vector.c                    | 4 ++++
 backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp | 4 ++++
 backends/sycl-ref/ceed-sycl-vector.sycl.cpp               | 4 ++++
 6 files changed, 24 insertions(+)

diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
index b5d25b6f63..491e658338 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
@@ -79,7 +79,9 @@ static inline int CeedQFunctionContextSync_Cuda(const CeedQFunctionContext ctx,
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSyncH2D_Cuda(ctx);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -223,7 +225,9 @@ static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, cons
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSetDataDevice_Cuda(ctx, copy_mode, data);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 3675a727d7..b0489d36d6 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -107,7 +107,9 @@ static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type)
     case CEED_MEM_DEVICE:
       return CeedVectorSyncH2D_Cuda(vec);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -217,7 +219,9 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_t
     case CEED_MEM_DEVICE:
       return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
index 84a69716e6..a223fa91d8 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
@@ -78,7 +78,9 @@ static inline int CeedQFunctionContextSync_Hip(const CeedQFunctionContext ctx, C
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSyncH2D_Hip(ctx);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -222,7 +224,9 @@ static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, const
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSetDataDevice_Hip(ctx, copy_mode, data);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 0bf497f27f..f1d1dcd93a 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -115,7 +115,9 @@ static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) {
     case CEED_MEM_DEVICE:
       return CeedVectorSyncH2D_Hip(vec);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -270,7 +272,9 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_ty
     case CEED_MEM_DEVICE:
       return CeedVectorSetArrayDevice_Hip(vec, copy_mode, array);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
index b6a50a0226..1a08c26cb5 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
@@ -90,7 +90,9 @@ static inline int CeedQFunctionContextSync_Sycl(const CeedQFunctionContext ctx,
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSyncH2D_Sycl(ctx);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -247,7 +249,9 @@ static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, cons
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSetDataDevice_Sycl(ctx, copy_mode, data);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
index 9b1e8f1033..689d84f78e 100644
--- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
@@ -118,7 +118,9 @@ static int CeedVectorSyncArray_Sycl(const CeedVector vec, CeedMemType mem_type)
     case CEED_MEM_DEVICE:
       return CeedVectorSyncH2D_Sycl(vec);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -267,7 +269,9 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t
     case CEED_MEM_DEVICE:
       return CeedVectorSetArrayDevice_Sycl(vec, copy_mode, array);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------

From 3c7ef06308310bc5d5828e3a984a4aa80a426f06 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Mar 2026 17:15:46 -0600
Subject: [PATCH 559/571] dealii - set cmake min to 3.10

---
 examples/deal.II/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/deal.II/CMakeLists.txt b/examples/deal.II/CMakeLists.txt
index ca87519479..d5de2d7ddb 100644
--- a/examples/deal.II/CMakeLists.txt
+++ b/examples/deal.II/CMakeLists.txt
@@ -1,4 +1,4 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 3.5.0)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.10.0)
 
 FIND_PACKAGE(deal.II 8.0 QUIET
   HINTS ${deal.II_DIR} ${DEAL_II_DIR} ../ ../../ $ENV{DEAL_II_DIR}

From 6b1bb947516b708707aaa03d7d2d65470927fb32 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Thu, 12 Mar 2026 17:18:30 -0600
Subject: [PATCH 560/571] gitignore - stray Nek5k file

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 6baed96b94..0cb9d41a69 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,9 @@ doc/sphinx/source/examples/
 # Clang GPU temp files
 temp/*
 
+# Nek5K
+SESSION.NAME
+
 # Output files, videos, and compressed archives should not be added accidentally
 *.avi
 *.bin

From 16baf82b58149c30c645444a3c37d0afd9008a0f Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 13 Mar 2026 09:10:10 -0600
Subject: [PATCH 561/571] LLM/Gen-AI Policy (#1933)

* contributing - draft LLM statement

* contributing - deleniate permitted and disallowed LLM usage

* contributing - grammar

Co-authored-by: Valeria Barra <39932030+valeriabarra@users.noreply.github.com>

* dev - add PR template

* contributing - wording consistency between PR template and CONTRIBUTING.md

* minor - typo

* contributing - refrence sf dora

* minor - fix spelling

Co-authored-by: Yohann <dudouit1@llnl.gov>

---------

Co-authored-by: Valeria Barra <39932030+valeriabarra@users.noreply.github.com>
Co-authored-by: Yohann <dudouit1@llnl.gov>
---
 .github/pull_request_template.md | 12 ++++++++++++
 CONTRIBUTING.md                  | 12 ++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 .github/pull_request_template.md

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000000..9c6f967319
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,12 @@
+Purpose:
+
+Describe the purpose of the PR here.
+
+Closes: #ISSUE_NUMBER
+
+LLM/GenAI Disclosure:
+
+Describe any LLM and GenAI usage here.
+
+By submitting this PR, the author certifies to its contents as described by the [Developer's Certificate of Origin](https://developercertificate.org/).
+Please follow the [Contributing Guidelines](https://github.com/CEED/libCEED/blob/main/CONTRIBUTING.md) for all PRs.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d3eeeb5c03..5f0cdbf4fd 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,6 +44,18 @@ By making a contribution to this project, I certify that:
 
 (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
 
+## LLM Generated Content
+
+libCEED is a research software project, and we require citation of the origin of ideas in the same way that citations are expected for research papers.
+See the [San Francisco Declaration on Research Assessment](https://sfdora.org/read) for discussion on treating other research outputs, such as datasets and software, as first class artifacts like research papers.
+
+LLM/GenAI generated code can contain novel algorithms developed by other researchers and replicated without attribution.
+As such, we cannot accept pull requests containing code predominantly generated by LLM/GenAI.
+
+LLMs may be used to aid the development of code for pull requests (PR); however, the individual submitting the PR must certify to its contents as described by Developer's Certificate of Origin.
+The human creating the PR is ultimately responsible for the content in the PR.
+PRs must disclose and describe all LLM usage.
+
 ## Authorship
 
 libCEED contains components authored by many individuals.

From 4d7a563fc20aa7fe05bbc578ffc891a3f50aaf44 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 13 Mar 2026 09:33:29 -0600
Subject: [PATCH 562/571] make - force dealii serial build from top make

---
 Makefile          | 5 ++++-
 examples/Makefile | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 5dec972cfc..a29e1bfe65 100644
--- a/Makefile
+++ b/Makefile
@@ -702,10 +702,13 @@ $(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.f90 | $$(@D)/.DIR
 
 # deal.II
 # Note: Invoking deal.II's CMAKE build system here
-$(OBJDIR)/dealii-% : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) | $$(@D)/.DIR
+.NOPARALLEL: dealii
+dealii :
 	mkdir -p examples/deal.II/build
 	cmake -B examples/deal.II/build -S examples/deal.II -DDEAL_II_DIR=$(DEAL_II_DIR) -DCEED_DIR=$(PWD)
 	+$(call quiet,MAKE) -C examples/deal.II/build
+
+$(OBJDIR)/dealii-% : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) dealii | $$(@D)/.DIR
 	cp examples/deal.II/build/$* $@
 
 # MFEM
diff --git a/examples/Makefile b/examples/Makefile
index f220748ba2..4cb4a1ed9e 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -36,6 +36,9 @@ all: ceed mfem nek petsc fluids solids
 ceed:
 	make CEED_DIR=$(CEED_DIR) -C ceed all
 
+dealii:
+	$(RM) -rf deal.II/build
+
 mfem:
 	make CEED_DIR=$(CEED_DIR) MFEM_DIR=$(MFEM_DIR) -C mfem all
 
@@ -54,7 +57,7 @@ fluids:
 solids:
 	make CEED_DIR=$(CEED_DIR) PETSC_DIR=$(PETSC_DIR) PETSC_ARCH=$(PETSC_ARCH) -C solids all
 
-clean:
+clean: dealii
 	+make -C ceed clean
 	+make -C mfem clean
 	+make -C nek clean

From f816377087fa8bceb7974578f13b26312fa75b3c Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 13 Mar 2026 12:08:00 -0600
Subject: [PATCH 563/571] rust - allow setting ceed OPT flags for rust, mostly
 for CI/cov

---
 .github/workflows/rust-test-with-style.yml | 2 +-
 rust/libceed-sys/build.rs                  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust-test-with-style.yml b/.github/workflows/rust-test-with-style.yml
index 80ea4a4dbf..63ce2d7b3c 100644
--- a/.github/workflows/rust-test-with-style.yml
+++ b/.github/workflows/rust-test-with-style.yml
@@ -32,7 +32,7 @@ jobs:
       env:
         CC: ${{ matrix.compiler }}
         FC: gfortran
-      run: cargo llvm-cov test --doctests --lcov --output-path lcov.info
+      run: CARGO_CEED_OPT_FLAGS="-g -O0 -fno-inline" cargo llvm-cov test --doctests --lcov --output-path lcov.info
     - name: Codecov upload
       uses: codecov/codecov-action@v4
       with:
diff --git a/rust/libceed-sys/build.rs b/rust/libceed-sys/build.rs
index d1cc93be6e..8c510fcc8c 100644
--- a/rust/libceed-sys/build.rs
+++ b/rust/libceed-sys/build.rs
@@ -14,6 +14,7 @@ fn main() {
     } else {
         // Install libceed.a or libceed.so to $OUT_DIR/lib
         let makeflags = env("CARGO_MAKEFLAGS").unwrap();
+        let optflags = env("CARGO_CEED_OPT_FLAGS").unwrap_or_else(|| "".to_string());
         let mut make = Command::new("make");
         make.arg("install")
             .arg(format!("prefix={}", out_dir.to_string_lossy()))
@@ -28,6 +29,9 @@ fn main() {
             .arg("FC=") // Don't try to find Fortran (unused library build/install)
             .env("MAKEFLAGS", makeflags)
             .current_dir("c-src");
+        if optflags.len() > 0 {
+            make.env("OPT", optflags);
+        }
         if statik {
             make.arg("STATIC=1");
         }

From a23f91e7dde13e25c67b07f1369be92942ee2243 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 13 Mar 2026 12:18:15 -0600
Subject: [PATCH 564/571] rust - minor lifeline clarification

---
 examples/rust/mesh/src/lib.rs |  4 ++--
 rust/libceed/src/operator.rs  | 10 +++++-----
 rust/libceed/src/qfunction.rs | 12 ++++++------
 rust/libceed/src/vector.rs    |  4 ++--
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs
index 775d8e5ec2..ce48153b18 100644
--- a/examples/rust/mesh/src/lib.rs
+++ b/examples/rust/mesh/src/lib.rs
@@ -48,7 +48,7 @@ pub fn build_cartesian_restriction(
     degree: usize,
     num_comp: usize,
     num_qpts: usize,
-) -> libceed::Result<(ElemRestriction, ElemRestriction)> {
+) -> libceed::Result<(ElemRestriction<'_>, ElemRestriction<'_>)> {
     let p = degree + 1;
     let num_nodes = p.pow(dim as u32); // number of nodes per element
     let elem_qpts = num_qpts.pow(dim as u32); // number of quadrature pts per element
@@ -119,7 +119,7 @@ pub fn cartesian_mesh_coords(
     num_xyz: [usize; 3],
     mesh_degree: usize,
     mesh_size: usize,
-) -> libceed::Result<Vector> {
+) -> libceed::Result<Vector<'_>> {
     let p = mesh_degree + 1;
     let mut num_d = [0; 3];
     let mut scalar_size = 1;
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 85cb3a0d18..fae468d3c9 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -175,7 +175,7 @@ impl<'a> OperatorField<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn elem_restriction(&self) -> ElemRestrictionOpt {
+    pub fn elem_restriction(&self) -> ElemRestrictionOpt<'_> {
         if self.elem_restriction.ptr == unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE } {
             ElemRestrictionOpt::None
         } else {
@@ -237,7 +237,7 @@ impl<'a> OperatorField<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn basis(&self) -> BasisOpt {
+    pub fn basis(&self) -> BasisOpt<'_> {
         if self.basis.ptr == unsafe { bind_ceed::CEED_BASIS_NONE } {
             BasisOpt::None
         } else {
@@ -285,7 +285,7 @@ impl<'a> OperatorField<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn vector(&self) -> VectorOpt {
+    pub fn vector(&self) -> VectorOpt<'_> {
         if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_ACTIVE } {
             VectorOpt::Active
         } else if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_NONE } {
@@ -856,7 +856,7 @@ impl<'a> Operator<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<Vec<crate::OperatorField>> {
+    pub fn inputs(&self) -> crate::Result<Vec<crate::OperatorField<'_>>> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
@@ -926,7 +926,7 @@ impl<'a> Operator<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<Vec<crate::OperatorField>> {
+    pub fn outputs(&self) -> crate::Result<Vec<crate::OperatorField<'_>>> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index 09912c93be..f1eb5786f9 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -467,7 +467,7 @@ impl<'a> QFunctionCore<'a> {
         })
     }
 
-    pub fn inputs(&self) -> crate::Result<&[QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
@@ -487,7 +487,7 @@ impl<'a> QFunctionCore<'a> {
         Ok(inputs_slice)
     }
 
-    pub fn outputs(&self) -> crate::Result<&[QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
@@ -826,7 +826,7 @@ impl<'a> QFunction<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.inputs()
     }
 
@@ -856,7 +856,7 @@ impl<'a> QFunction<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.outputs()
     }
 }
@@ -960,7 +960,7 @@ impl<'a> QFunctionByName<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.inputs()
     }
 
@@ -979,7 +979,7 @@ impl<'a> QFunctionByName<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.outputs()
     }
 }
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index 42c1c211f0..a1f9cd5178 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -561,7 +561,7 @@ impl<'a> Vector<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn view(&self) -> crate::Result<VectorView> {
+    pub fn view(&self) -> crate::Result<VectorView<'_>> {
         VectorView::new(self)
     }
 
@@ -583,7 +583,7 @@ impl<'a> Vector<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn view_mut(&mut self) -> crate::Result<VectorViewMut> {
+    pub fn view_mut(&mut self) -> crate::Result<VectorViewMut<'_>> {
         VectorViewMut::new(self)
     }
 

From 78eb7e18a17dcc481a1ea416323c2d800e1a25f6 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Fri, 13 Mar 2026 14:34:34 -0600
Subject: [PATCH 565/571] cov - missing excl marker

---
 tests/t599-operator.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/t599-operator.c b/tests/t599-operator.c
index 1690d438cb..a38d6b1f47 100644
--- a/tests/t599-operator.c
+++ b/tests/t599-operator.c
@@ -124,7 +124,9 @@ int main(int argc, char **argv) {
     CeedVectorRestoreArrayRead(v, &v_array);
     // Summing 9 reference elements, each 2x2 => 36 sq units area
     if (fabs(sum - 4.0 * num_elem) > CEED_EPSILON * 5e3) {
+      // LCOV_EXCL_START
       printf("Incorrect area computed, %g != %g (abs error %g)\n", sum, 4.0 * num_elem, fabs(sum - 4.0 * num_elem));
+      // LCOV_EXCL_STOP
     }
   }
 

From db0d71ce740256902e8b92792a34d7cd9f29df92 Mon Sep 17 00:00:00 2001
From: Paulius Velesko <pvelesko@pglc.io>
Date: Tue, 17 Mar 2026 09:34:30 +0200
Subject: [PATCH 566/571] Makefile: add chipStar HIP platform detection and
 dynamic lib name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Detect HIP platform at build time via hipconfig output:
- __HIP_PLATFORM_SPIRV__ → HIP_LIB_NAME=CHIP (chipStar)
- __HIP_PLATFORM_HCC__/__HIP_PLATFORM_AMD__ → HIP_LIB_NAME=amdhip64

Move ROCM_DIR and HIP_ARCH defaults to the top of the file where other
tool-path defaults live. Use HIP_LIB_NAME in the library detection glob
and in PKG_LIBS. Remove the subst=,, stripping from HIPCONFIG_CPPFLAGS
so flags are passed through unmodified.
---
 Makefile | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index a29e1bfe65..9c0d807e45 100644
--- a/Makefile
+++ b/Makefile
@@ -77,6 +77,9 @@ endif
 ifeq (,$(filter-out undefined default,$(origin ARFLAGS)))
   ARFLAGS = $(if $(DARWIN),cr,crD)
 endif
+# Often /opt/rocm, but sometimes present on machines that don't support HIP
+ROCM_DIR ?= ${HIP_DIR}
+HIP_ARCH ?=
 NVCC ?= $(CUDA_DIR)/bin/nvcc
 NVCC_CXX ?= $(CXX)
 HIPCC ?= $(ROCM_DIR)/bin/hipcc
@@ -88,6 +91,16 @@ ifneq ($(EMSCRIPTEN),)
   EM_LDFLAGS = -s TOTAL_MEMORY=256MB
 endif
 
+HIP_CONFIG_RES := $(shell $(ROCM_DIR)/bin/hipconfig)
+ifneq (,$(findstring __HIP_PLATFORM_SPIRV__,$(HIP_CONFIG_RES)))
+  HIP_LIB_NAME = CHIP
+else ifneq (,$(findstring __HIP_PLATFORM_HCC__,$(HIP_CONFIG_RES)))
+  HIP_LIB_NAME = amdhip64
+else ifneq (,$(findstring __HIP_PLATFORM_AMD__,$(HIP_CONFIG_RES)))
+  HIP_LIB_NAME = amdhip64
+else 
+  $(error "HIP platform not supported")
+endif
 # ASAN must be left empty if you don't want to use it
 ASAN ?=
 
@@ -555,16 +568,16 @@ ifneq ($(CUDA_LIB_DIR),)
 endif
 
 # HIP Backends
-HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/libamdhip64.${SO_EXT}))
+HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/lib${HIP_LIB_NAME}.${SO_EXT}))
 HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR))))
 HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen
 ifneq ($(HIP_LIB_DIR),)
-  HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(ROCM_DIR)/bin/hipconfig -C))
+  HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C)
   $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS)
   ifneq ($(CXX), $(HIPCC))
     $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS)
   endif
-  PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -lamdhip64 -lhipblas
+  PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas
   LIBCEED_CONTAINS_CXX = 1
   libceed.c     += $(hip-all.c)
   libceed.cpp   += $(hip-all.cpp)

From 40f5f4cb0c433c7fa1b1e5347aafcd3f7aa9176d Mon Sep 17 00:00:00 2001
From: Paulius Velesko <pvelesko@pglc.io>
Date: Tue, 17 Mar 2026 09:34:49 +0200
Subject: [PATCH 567/571] Makefile: use SYCLCXX as linker for libceed.so when
 SYCL is enabled

When SYCL backends are built, libceed.so must be linked with icpx
(SYCLCXX) rather than g++, and -fsycl must appear in CEED_LDFLAGS
(before object files) so icpx can merge the SYCL fat binary device
sections. Without this, libceed.so lacks NEEDED: libsycl.so.7 and
SYCL kernels fail to load at runtime.
---
 Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 9c0d807e45..1d3dcad4a7 100644
--- a/Makefile
+++ b/Makefile
@@ -653,7 +653,12 @@ endif
 
 pkgconfig-libs-private = $(PKG_LIBS)
 ifeq ($(LIBCEED_CONTAINS_CXX),1)
-  $(libceeds) : LINK = $(CXX)
+  ifneq ($(SYCL_LIB_DIR),)
+    $(libceeds) : LINK = $(SYCLCXX)
+    $(libceeds) : CEED_LDFLAGS += $(SYCLFLAGS)
+  else
+    $(libceeds) : LINK = $(CXX)
+  endif
   ifeq ($(STATIC),1)
     $(examples) $(tests) : CEED_LDLIBS += $(LIBCXX)
     pkgconfig-libs-private += $(LIBCXX)

From 59c1d55f300d3e76952c1b5a20c6d74e5a2e355d Mon Sep 17 00:00:00 2001
From: Paulius Velesko <pvelesko@pglc.io>
Date: Tue, 17 Mar 2026 09:36:09 +0200
Subject: [PATCH 568/571] hip: refactor element loops to prevent syncthread
 deadlocks on chipStar

Replace the outer element for-loop with a single element assignment and
guard all memory accesses with if (elem < num_elem). Shared memory
operations (Interp, Grad, etc.) must execute unconditionally across all
threads so __syncthreads() is reached uniformly; only the
load/store steps are guarded. Also guard qfunction calls with
thread-id bounds checks and comment out the pragma unroll that
triggered miscompilation on chipStar's LLVM.
---
 Makefile                                      |   1 +
 .../hip-gen/ceed-hip-gen-operator-build.cpp   |  27 ++
 .../jit-source/hip/hip-ref-basis-nontensor.h  |  34 +++
 .../jit-source/hip/hip-shared-basis-tensor.h  | 249 ++++++++++++++++++
 4 files changed, 311 insertions(+)

diff --git a/Makefile b/Makefile
index 1d3dcad4a7..9180a2cb0e 100644
--- a/Makefile
+++ b/Makefile
@@ -101,6 +101,7 @@ else ifneq (,$(findstring __HIP_PLATFORM_AMD__,$(HIP_CONFIG_RES)))
 else 
   $(error "HIP platform not supported")
 endif
+
 # ASAN must be left empty if you don't want to use it
 ASAN ?=
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index 4f2438cffa..d2261d6f1b 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -12,6 +12,7 @@
 #include <ceed/gen-tools.h>
 #include <ceed/jit-tools.h>
 
+#include <cstring>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -1029,6 +1030,10 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
   // Apply QFunction
   code << "\n";
   code << tab << "// -- Apply QFunction\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "if (elem < num_elem) {\n";
+  tab.push();
+#endif
   code << tab << "" << qfunction_name << "(ctx, ";
   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "1";
@@ -1036,6 +1041,10 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
     code << Q_name;
   }
   code << ", inputs, outputs);\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  tab.pop();
+  code << tab << "}\n";
+#endif
 
   if (is_at_points) {
     // Map back to coefficients
@@ -1495,8 +1504,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   // Loop over all elements
   code << "\n" << tab << "// Element loop\n";
   code << tab << "__syncthreads();\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n";
+#else
   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
   tab.push();
+#endif
 
   // -- Compute minimum buffer space needed
   CeedInt max_rstr_buffer_size = 1;
@@ -1617,8 +1630,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
   }
 
   // Close loop and function
+#ifndef __HIP_PLATFORM_SPIRV__
   tab.pop();
   code << tab << "}\n";
+#endif
   tab.pop();
   code << tab << "}\n";
   code << tab << "// -----------------------------------------------------------------------------\n\n";
@@ -1853,8 +1868,12 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
   // Loop over all elements
   code << "\n" << tab << "// Element loop\n";
   code << tab << "__syncthreads();\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n";
+#else
   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
   tab.push();
+#endif
 
   // -- Compute minimum buffer space needed
   CeedInt max_rstr_buffer_size = 1;
@@ -2080,8 +2099,10 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
   code << tab << "}\n";
 
   // Close loop and function
+#ifndef __HIP_PLATFORM_SPIRV__
   tab.pop();
   code << tab << "}\n";
+#endif
   tab.pop();
   code << tab << "}\n";
   code << tab << "// -----------------------------------------------------------------------------\n\n";
@@ -2421,8 +2442,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
   // Loop over all elements
   code << "\n" << tab << "// Element loop\n";
   code << tab << "__syncthreads();\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n";
+#else
   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
   tab.push();
+#endif
 
   // -- Compute minimum buffer space needed
   CeedInt max_rstr_buffer_size = 1;
@@ -2662,8 +2687,10 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
   code << tab << "}\n";
 
   // Close loop and function
+#ifndef __HIP_PLATFORM_SPIRV__
   tab.pop();
   code << tab << "}\n";
+#endif
   tab.pop();
   code << tab << "}\n";
   code << tab << "// -----------------------------------------------------------------------------\n\n";
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
index 923318aa86..71074a35dc 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
@@ -20,18 +20,32 @@
 //------------------------------------------------------------------------------
 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                   CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    Contract<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
+                                                                    BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     Contract<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
                                                                     BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
+                                                                             BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
                                                                              BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -39,18 +53,32 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
 //------------------------------------------------------------------------------
 extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    Contract<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
+                                                                   BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     Contract<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
                                                                    BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
+                                                                            BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
                                                                             BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -60,7 +88,13 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   const CeedInt t_id = threadIdx.x;
   // TODO load q_weight in shared memory if blockDim.z > 1?
 
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    d_V[elem * BASIS_Q + t_id] = q_weight[t_id];
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     d_V[elem * BASIS_Q + t_id] = q_weight[t_id];
   }
+#endif
 }
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 3b82d1190c..9e1d3b5263 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -35,6 +35,24 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+    Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                          BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -52,6 +70,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
@@ -68,6 +87,23 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    }
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -82,6 +118,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
@@ -104,6 +141,24 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                          BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -121,6 +176,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
@@ -137,6 +193,23 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -151,6 +224,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
@@ -173,6 +247,24 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -190,6 +282,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
@@ -207,6 +300,23 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -221,6 +331,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
     }
   }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -248,6 +359,26 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+    Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -267,6 +398,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
@@ -290,6 +422,25 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+    Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -308,6 +459,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
@@ -333,6 +485,26 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                          BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -352,6 +524,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
@@ -375,6 +548,25 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                          BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -393,6 +585,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
@@ -418,6 +611,26 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -437,6 +650,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
@@ -460,6 +674,25 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   __syncthreads();
 
   // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -478,6 +711,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -496,6 +730,20 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
 
   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
 
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
+    if (elem < num_elem) WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
+  } else if (BASIS_DIM == 2) {
+    WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
+    if (elem < num_elem) WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
+  } else if (BASIS_DIM == 3) {
+    WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
+    if (elem < num_elem) WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
+                                           d_W);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
@@ -509,4 +757,5 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
                                            d_W);
     }
   }
+#endif
 }

From e3f38d5ec083e520f757308058f181c047edf771 Mon Sep 17 00:00:00 2001
From: Paulius Velesko <pvelesko@pglc.io>
Date: Tue, 17 Mar 2026 09:36:54 +0200
Subject: [PATCH 569/571] tests/junit_common: use errors='replace' when
 decoding subprocess output

Prevents UnicodeDecodeError when GPU runtimes emit non-UTF-8 bytes
(e.g. chipStar CHIP warnings contain raw binary data in some paths).
---
 tests/junit_common.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/junit_common.py b/tests/junit_common.py
index eee7924ba9..c03ca24659 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -244,7 +244,7 @@ def has_cgnsdiff() -> bool:
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           env=my_env)
-    return 'not found' not in proc.stderr.decode('utf-8')
+    return 'not found' not in proc.stderr.decode('utf-8', errors='replace')
 
 
 def contains_any(base: str, substrings: List[str]) -> bool:
@@ -498,7 +498,7 @@ def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float) -> str:
                           stderr=subprocess.PIPE,
                           env=my_env)
 
-    return proc.stderr.decode('utf-8') + proc.stdout.decode('utf-8')
+    return proc.stderr.decode('utf-8', errors='replace') + proc.stdout.decode('utf-8', errors='replace')
 
 
 def diff_ascii(test_file: Path, true_file: Path, backend: str) -> str:
@@ -634,8 +634,8 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
                              classname=source_path.parent,
                              elapsed_sec=time.time() - start,
                              timestamp=time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime(start)),
-                             stdout=proc.stdout.decode('utf-8'),
-                             stderr=proc.stderr.decode('utf-8'),
+                             stdout=proc.stdout.decode('utf-8', errors='replace'),
+                             stderr=proc.stderr.decode('utf-8', errors='replace'),
                              allow_multiple_subelements=True,
                              category=spec.name,)
         ref_csvs: List[Path] = []

From 7d1f784ababd32c1c95fd12cde9ecd59640a6cc0 Mon Sep 17 00:00:00 2001
From: Paulius Velesko <pvelesko@pglc.io>
Date: Tue, 17 Mar 2026 09:37:08 +0200
Subject: [PATCH 570/571] tests/junit_common: filter chipStar CHIP runtime
 lines from test stderr

chipStar prints 'CHIP info/warning/debug ...' lines to stderr on every
run. These are not test failures but caused JUnit to mark all HIP tests
as failed. Filter them out before checking whether stderr is non-empty.
---
 tests/junit_common.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/junit_common.py b/tests/junit_common.py
index c03ca24659..ce1115a547 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -670,7 +670,12 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
 
     # classify other results
     if not test_case.is_skipped() and not test_case.status:
-        if test_case.stderr:
+        # Filter out chipStar (CHIP) runtime informational/warning lines which are not errors
+        filtered_stderr = '\n'.join(
+            line for line in test_case.stderr.split('\n')
+            if not line.startswith(('CHIP info ', 'CHIP warning ', 'CHIP debug '))
+        ).strip()
+        if filtered_stderr:
             test_case.add_failure_info('stderr', test_case.stderr)
         if proc.returncode != 0:
             test_case.add_error_info(f'returncode = {proc.returncode}')

From a65eb92355b99cc8bfc09a037e0f560ab5642eff Mon Sep 17 00:00:00 2001
From: Paulius Velesko <pvelesko@pglc.io>
Date: Tue, 17 Mar 2026 11:24:50 +0200
Subject: [PATCH 571/571] Makefile: filter chipStar clang-only flags from
 HIPCONFIG_CPPFLAGS for gcc

chipStar's hipconfig -C outputs --offload=spirv64, -nohipwrapperinc, --hip-path=,
and --target= which are clang-only flags.  When CC=gcc is used for .c files (or
CXX != HIPCC for .cpp files), these flags cause build failures.

Add HIPCONFIG_CPPFLAGS_C that filters the clang-only flags and adds an explicit
-I$(ROCM_DIR)/include (since -nohipwrapperinc was suppressing the wrapper that
would have pulled in hip_runtime.h).
---
 Makefile | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 9180a2cb0e..f824ba2cd8 100644
--- a/Makefile
+++ b/Makefile
@@ -574,9 +574,16 @@ HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR))))
 HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen
 ifneq ($(HIP_LIB_DIR),)
   HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C)
-  $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS)
+  # chipStar hipconfig -C includes clang-only flags (--target=, --offload=, -nohipwrapperinc, --hip-path=);
+  # strip those out for gcc-compiled C sources, keeping -D/-I/-include flags
+  ifeq ($(HIP_LIB_NAME),CHIP)
+    HIPCONFIG_CPPFLAGS_C := $(filter-out --offload% -nohipwrapperinc --hip-path% --target%,$(HIPCONFIG_CPPFLAGS)) -I$(ROCM_DIR)/include
+  else
+    HIPCONFIG_CPPFLAGS_C := $(HIPCONFIG_CPPFLAGS)
+  endif
+  $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
   ifneq ($(CXX), $(HIPCC))
-    $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS)
+    $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
   endif
   PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas
   LIBCEED_CONTAINS_CXX = 1